From 1a4d279d22e764eadbe3e042623bae5ef579c739 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Tue, 26 May 2026 11:33:57 -0700
Subject: [PATCH 01/27] Add Qwen3 AutoParallel model and examples

---
 autoparallel/_testing/models/dsv3.py       |   2 +-
 autoparallel/_testing/models/qwen3.py      | 976 +++++++++++++++++++++
 examples/example_qwen3.py                  | 242 +++++
 examples/example_sanity_check_qwen3.py     | 335 +++++++
 examples/example_sanity_check_qwen3_moe.py | 466 ++++++++++
 examples/example_torchtitan_qwen3_dense.py | 370 ++++++++
 tests/test_dsv3_torchtitan_config.py       |  35 +
 tests/test_qwen3.py                        | 323 +++++++
 8 files changed, 2748 insertions(+), 1 deletion(-)
 create mode 100644 autoparallel/_testing/models/qwen3.py
 create mode 100644 examples/example_qwen3.py
 create mode 100644 examples/example_sanity_check_qwen3.py
 create mode 100644 examples/example_sanity_check_qwen3_moe.py
 create mode 100644 examples/example_torchtitan_qwen3_dense.py
 create mode 100644 tests/test_dsv3_torchtitan_config.py
 create mode 100644 tests/test_qwen3.py

diff --git a/autoparallel/_testing/models/dsv3.py b/autoparallel/_testing/models/dsv3.py
index 5a897b71..05f78a92 100644
--- a/autoparallel/_testing/models/dsv3.py
+++ b/autoparallel/_testing/models/dsv3.py
@@ -1581,7 +1581,7 @@ def __init__(
                 route_norm=moe_cfg.router.route_norm,
                 route_scale=moe_cfg.router.route_scale,
                 score_before_experts=moe_cfg.experts.token_dispatcher.score_before_experts,
-                use_grouped_mm=moe_cfg.experts.use_grouped_mm,
+                use_grouped_mm=getattr(moe_cfg.experts, "use_grouped_mm", True),
                 load_balance_coeff=moe_cfg.load_balance_coeff,
                 mesh=mesh,
                 compute_dtype=compute_dtype,
diff --git a/autoparallel/_testing/models/qwen3.py b/autoparallel/_testing/models/qwen3.py
new file mode 100644
index 00000000..7bef8b17
--- /dev/null
+++ b/autoparallel/_testing/models/qwen3.py
@@ -0,0 +1,976 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+from typing import Callable, ClassVar, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.distributed.tensor import DeviceMesh
+from torch.distributed.tensor.placement_types import Partial, Replicate, Shard
+from torch.fx import traceback as fx_traceback
+from torch.nn.attention import sdpa_kernel, SDPBackend
+
+from autoparallel._testing.models.dsv3 import (
+    _permute,
+    _run_experts_for_loop,
+    _run_experts_grouped_mm,
+    _token_combine,
+)
+from autoparallel.collectives import all_to_all, axis_size, local_map
+
+
+def has_cuda_capability(major: int, minor: int) -> bool:
+    return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (
+        major,
+        minor,
+    )
+
+
+class ScaledDotProductAttention(torch.nn.Module):
+    backends: ClassVar[list[SDPBackend]] = []
+
+    def __init__(self, attn_mask_type: str) -> None:
+        super().__init__()
+        if attn_mask_type != "causal":
+            raise ValueError("Qwen3 with SDPA currently only supports causal mask.")
+
+        ScaledDotProductAttention._init_backend()
+
+    @classmethod
+    def _init_backend(cls) -> None:
+        if cls.backends:
+            return
+
+        cls.backends = [
+            SDPBackend.FLASH_ATTENTION,
+            SDPBackend.EFFICIENT_ATTENTION,
+            SDPBackend.MATH,
+        ]
+        if has_cuda_capability(10, 0):
+            cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION)
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        scale: float | None = None,
+    ) -> torch.Tensor:
+        assert self.backends, "SDPA backends should not be empty."
+        with sdpa_kernel(self.backends, set_priority=True):
+            return F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                is_causal=True,
+                scale=scale,
+            )
+
+
+def build_attention(attn_mask_type: str):
+    if attn_mask_type != "causal":
+        raise ValueError("Qwen3 with SDPA currently only supports causal mask.")
+    return ScaledDotProductAttention(attn_mask_type)
+
+
+@dataclass
+class Qwen3ModelArgs:
+    dim: int = 4096
+    n_layers: int = 36
+    n_heads: int = 32
+    n_kv_heads: Optional[int] = 8
+    head_dim: int = 128
+    hidden_dim: int = 12288
+    vocab_size: int = 151936
+    norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    max_seq_len: int = 4096
+    depth_init: bool = True
+    attn_mask_type: str = "causal"
+    eos_id: int = 0
+    enable_weight_tying: bool = False
+    moe_enabled: bool = False
+    moe_hidden_dim: int = 768
+    num_experts: int = 64
+    top_k: int = 8
+    route_norm: bool = True
+    route_scale: float = 1.0
+    score_before_experts: bool = False
+    use_grouped_mm: bool = True
+    load_balance_coeff: Optional[float] = 1e-3
+    moe_axis_name: str = "ep"
+
+    def __post_init__(self) -> None:
+        n_kv_heads = self.n_heads if self.n_kv_heads is None else self.n_kv_heads
+        if self.n_heads % n_kv_heads != 0:
+            raise ValueError(
+                f"n_heads ({self.n_heads}) must be divisible by "
+                f"n_kv_heads ({n_kv_heads})."
+            )
+        if self.moe_enabled and self.top_k > self.num_experts:
+            raise ValueError(
+                f"top_k ({self.top_k}) must be <= num_experts ({self.num_experts})."
+            )
+
+    def update_from_config(self, job_config, tokenizer) -> None:
+        self.vocab_size = tokenizer.n_words
+        self.max_seq_len = job_config.training.seq_len
+        self.eos_id = tokenizer.eos_id
+
+    def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
+        nparams = sum(p.numel() for p in model.parameters())
+        nparams_embedding = sum(
+            sum(p.numel() for p in m.parameters())
+            for m in model.children()
+            if isinstance(m, nn.Embedding)
+        )
+
+        l, h, q, t = (
+            self.n_layers,
+            self.n_heads,
+            self.head_dim,
+            seq_len,
+        )
+        num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
+        return nparams, num_flops_per_token
+
+
+def qwen3_args_from_torchtitan_config(config) -> Qwen3ModelArgs:
+    """Build AutoParallel Qwen3 args from TorchTitan's Qwen3Model.Config."""
+    if not config.layers:
+        raise ValueError("Qwen3 config must contain at least one layer.")
+
+    first_layer = config.layers[0]
+    attention = first_layer.attention
+    moe = first_layer.moe
+
+    if getattr(attention, "fuse_qkv", False):
+        raise ValueError("AutoParallel Qwen3 does not support fused QKV yet.")
+
+    moe_enabled = moe is not None
+    if moe_enabled:
+        hidden_dim = 0
+        moe_hidden_dim = moe.experts.hidden_dim
+        num_experts = moe.num_experts
+        top_k = moe.router.top_k
+        route_norm = moe.router.route_norm
+        route_scale = moe.router.route_scale
+        score_before_experts = moe.experts.token_dispatcher.score_before_experts
+        load_balance_coeff = moe.load_balance_coeff
+    else:
+        hidden_dim = first_layer.feed_forward.w1.out_features
+        moe_hidden_dim = 0
+        num_experts = 0
+        top_k = 1
+        route_norm = True
+        route_scale = 1.0
+        score_before_experts = False
+        load_balance_coeff = None
+
+    return Qwen3ModelArgs(
+        dim=config.dim,
+        n_layers=len(config.layers),
+        n_heads=attention.n_heads,
+        n_kv_heads=attention.n_kv_heads,
+        head_dim=attention.head_dim,
+        hidden_dim=hidden_dim,
+        vocab_size=config.vocab_size,
+        norm_eps=config.norm.eps,
+        rope_theta=config.rope.theta,
+        max_seq_len=config.rope.max_seq_len,
+        attn_mask_type=attention.mask_type,
+        enable_weight_tying=config.enable_weight_tying,
+        moe_enabled=moe_enabled,
+        moe_hidden_dim=moe_hidden_dim,
+        num_experts=num_experts,
+        top_k=top_k,
+        route_norm=route_norm,
+        route_scale=route_scale,
+        score_before_experts=score_before_experts,
+        load_balance_coeff=load_balance_coeff,
+    )
+
+
+def qwen3_debug_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=256,
+        n_layers=8,
+        n_heads=16,
+        n_kv_heads=8,
+        head_dim=128,
+        hidden_dim=3072,
+        vocab_size=2048,
+        max_seq_len=4096,
+        enable_weight_tying=True,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_0_6b_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=1024,
+        n_layers=28,
+        n_heads=16,
+        n_kv_heads=8,
+        head_dim=128,
+        hidden_dim=3072,
+        vocab_size=151936,
+        enable_weight_tying=True,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_1_7b_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=2048,
+        n_layers=28,
+        n_heads=16,
+        n_kv_heads=8,
+        head_dim=128,
+        hidden_dim=6144,
+        vocab_size=151936,
+        enable_weight_tying=True,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_4b_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=2560,
+        n_layers=36,
+        n_heads=32,
+        n_kv_heads=8,
+        head_dim=128,
+        hidden_dim=9728,
+        vocab_size=151936,
+        enable_weight_tying=True,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_8b_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs()
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_moe_debug_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=256,
+        n_layers=8,
+        n_heads=16,
+        n_kv_heads=8,
+        head_dim=128,
+        hidden_dim=3072,
+        vocab_size=2048,
+        max_seq_len=4096,
+        moe_enabled=True,
+        moe_hidden_dim=768,
+        num_experts=64,
+        top_k=8,
+        route_norm=True,
+        score_before_experts=False,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_30b_a3b_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=2048,
+        n_layers=48,
+        n_heads=32,
+        n_kv_heads=4,
+        head_dim=128,
+        hidden_dim=6144,
+        vocab_size=151936,
+        max_seq_len=262144,
+        moe_enabled=True,
+        moe_hidden_dim=768,
+        num_experts=128,
+        top_k=8,
+        route_norm=True,
+        score_before_experts=False,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def qwen3_235b_a22b_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=4096,
+        n_layers=94,
+        n_heads=64,
+        n_kv_heads=4,
+        head_dim=128,
+        hidden_dim=12288,
+        vocab_size=151936,
+        max_seq_len=4096,
+        rope_theta=5000000.0,
+        moe_enabled=True,
+        moe_hidden_dim=1536,
+        num_experts=128,
+        top_k=8,
+        route_norm=True,
+        score_before_experts=False,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def precompute_freqs_cos_sin(
+    dim: int,
+    max_seq_len: int,
+    theta: float = 1000000.0,
+) -> torch.Tensor:
+    freq = theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
+    inv_freq = 1.0 / freq
+    t = torch.arange(max_seq_len, dtype=inv_freq.dtype, device=inv_freq.device)
+    freqs = torch.outer(t, inv_freq).float()
+    freqs = torch.cat([freqs, freqs], dim=-1)
+    cos = freqs.cos()
+    sin = freqs.sin()
+    return torch.cat([cos, sin], dim=-1)
+
+
+def reshape_for_broadcast_cos_sin(
+    rope_cache: torch.Tensor,
+    x: torch.Tensor,
+) -> torch.Tensor:
+    bsz, seqlen, _, head_dim = x.shape
+    rope_cache = rope_cache[0:seqlen]
+    assert rope_cache.shape == (seqlen, head_dim * 2)
+    return rope_cache.view(1, seqlen, 1, head_dim * 2).expand(bsz, -1, -1, -1)
+
+
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_emb_cos_sin(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    rope_cache: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    head_dim = xq.shape[-1]
+    rope_cache = reshape_for_broadcast_cos_sin(rope_cache, xq)
+    cos = rope_cache[..., :head_dim].to(device=xq.device)
+    sin = rope_cache[..., head_dim:].to(device=xq.device)
+    xq_f = xq.float()
+    xk_f = xk.float()
+    xq_out = (xq_f * cos) + (_rotate_half(xq_f) * sin)
+    xk_out = (xk_f * cos) + (_rotate_half(xk_f) * sin)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    bs, slen, n_kv_heads, head_dim = x.shape
+    if n_rep == 1:
+        return x
+    return (
+        torch.unsqueeze(x, dim=3)
+        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+    )
+
+
+def _to_activation_device(tensor: torch.Tensor, activation: torch.Tensor) -> torch.Tensor:
+    if tensor.device != activation.device and tensor.device.type == "meta":
+        return tensor.to(activation.device)
+    return tensor
+
+
+def _rms_norm(x: torch.Tensor, norm: nn.RMSNorm) -> torch.Tensor:
+    weight = (
+        _to_activation_device(norm.weight, x)
+        if norm.weight is not None
+        else None
+    )
+    if weight is not None and weight.dtype != x.dtype:
+        weight = weight.to(dtype=x.dtype)
+    return F.rms_norm(x, norm.normalized_shape, weight, norm.eps).to(dtype=x.dtype)
+
+
+def _linear(x: torch.Tensor, linear: nn.Linear) -> torch.Tensor:
+    weight = _to_activation_device(linear.weight, x)
+    bias = (
+        _to_activation_device(linear.bias, x)
+        if linear.bias is not None
+        else None
+    )
+    if weight.dtype != x.dtype:
+        weight = weight.to(dtype=x.dtype)
+    if bias is not None and bias.dtype != x.dtype:
+        bias = bias.to(dtype=x.dtype)
+    return F.linear(x, weight, bias)
+
+
+class Attention(nn.Module):
+    def __init__(self, model_args: Qwen3ModelArgs):
+        super().__init__()
+        self.n_heads = model_args.n_heads
+        self.n_kv_heads = (
+            model_args.n_heads
+            if model_args.n_kv_heads is None
+            else model_args.n_kv_heads
+        )
+        self.n_rep = self.n_heads // self.n_kv_heads
+        self.head_dim = model_args.head_dim
+        self.scale = self.head_dim**-0.5
+
+        self.wq = nn.Linear(
+            model_args.dim, model_args.n_heads * self.head_dim, bias=False
+        )
+        self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(
+            model_args.n_heads * self.head_dim, model_args.dim, bias=False
+        )
+        self.q_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps)
+        self.k_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps)
+        self.sdpa = build_attention(model_args.attn_mask_type)
+
+    def init_weights(self, init_std: float):
+        for linear in (self.wq, self.wk, self.wv):
+            nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
+        nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)
+        self.q_norm.reset_parameters()
+        self.k_norm.reset_parameters()
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cos_sin: torch.Tensor,
+    ):
+        bs, seqlen, _ = x.shape
+        xq, xk, xv = _linear(x, self.wq), _linear(x, self.wk), _linear(x, self.wv)
+
+        xq = xq.view(bs, seqlen, -1, self.head_dim)
+        xk = xk.view(bs, seqlen, -1, self.head_dim)
+        xv = xv.view(bs, seqlen, -1, self.head_dim)
+
+        xq = _rms_norm(xq, self.q_norm)
+        xk = _rms_norm(xk, self.k_norm)
+        freqs_cos_sin = _to_activation_device(freqs_cos_sin, xq)
+        xq, xk = apply_rotary_emb_cos_sin(xq, xk, freqs_cos_sin)
+
+        keys = repeat_kv(xk, self.n_rep)
+        values = repeat_kv(xv, self.n_rep)
+
+        xq = xq.transpose(1, 2)
+        xk = keys.transpose(1, 2)
+        xv = values.transpose(1, 2)
+
+        output = self.sdpa(xq, xk, xv, scale=self.scale)
+
+        output = output.transpose(1, 2).contiguous()
+        output = output.view(bs, seqlen, -1)
+        return _linear(output, self.wo)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int):
+        super().__init__()
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x):
+        return _linear(F.silu(_linear(x, self.w1)) * _linear(x, self.w3), self.w2)
+
+    def init_weights(self, init_std: float):
+        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
+        for linear in (self.w2, self.w3):
+            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
+
+
+class GroupedExperts(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        num_experts: int,
+        use_grouped_mm: bool,
+    ):
+        super().__init__()
+        self.num_experts = num_experts
+        self.w1 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
+        self.w2 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
+        self.w3 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
+        self.use_grouped_mm = use_grouped_mm
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        num_tokens_per_expert: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.use_grouped_mm:
+            return _run_experts_grouped_mm(
+                self.w1, self.w2, self.w3, x, num_tokens_per_expert
+            )
+        return _run_experts_for_loop(
+            self.w1, self.w2, self.w3, x, num_tokens_per_expert
+        )
+
+    def init_weights(self, init_std: float):
+        nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02)
+        nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std)
+        nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std)
+
+
+def _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name):
+    ep_size = axis_size(axis_name)
+    num_tokens_per_expert_group = all_to_all(
+        num_tokens_per_expert,
+        None,
+        None,
+        axis_name,
+    )
+
+    with torch.no_grad():
+        input_splits = (
+            num_tokens_per_expert.view(ep_size, -1)
+            .sum(dim=1)
+            .to(torch.device("cpu"), non_blocking=True)
+        )
+        output_splits = (
+            num_tokens_per_expert_group.view(ep_size, -1)
+            .sum(dim=1)
+            .to(torch.device("cpu"), non_blocking=False)
+        )
+        input_splits = input_splits.tolist()
+        output_splits = output_splits.tolist()
+
+    with fx_traceback.annotate({"comm_region": "token_dispatch"}):
+        routed_input = all_to_all(
+            routed_input,
+            output_splits,
+            input_splits,
+            axis_name,
+        )
+
+    num_local_experts = num_tokens_per_expert_group.shape[0] // ep_size
+    return (
+        *_permute(
+            routed_input,
+            num_tokens_per_expert_group,
+            ep_size,
+            num_local_experts,
+        ),
+        input_splits,
+        output_splits,
+    )
+
+
+def qwen3_moe_local_mapped_region(
+    x: torch.Tensor,
+    selected_experts_indices: torch.Tensor,
+    top_scores: torch.Tensor,
+    experts_w1: torch.Tensor,
+    experts_w3: torch.Tensor,
+    experts_w2: torch.Tensor,
+    out: torch.Tensor,
+    top_k: int,
+    num_experts: int,
+    score_before_experts: bool,
+    axis_name: str,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    dim = x.shape[-1]
+    ep_size = axis_size(axis_name)
+    if num_experts % ep_size != 0:
+        raise ValueError(
+            f"num_experts ({num_experts}) must be divisible by "
+            f"axis_size({axis_name!r}) ({ep_size})."
+        )
+
+    num_tokens_per_expert = torch.histc(
+        selected_experts_indices.flatten(),
+        bins=num_experts,
+        min=0,
+        max=num_experts,
+    ).view(-1)
+
+    token_indices_experts_sorted = torch.argsort(
+        selected_experts_indices.view(-1), stable=True
+    )
+    top_scores_experts_sorted = top_scores.view(-1)[token_indices_experts_sorted]
+    token_indices_experts_sorted = token_indices_experts_sorted // top_k
+
+    routed_input = x[token_indices_experts_sorted]
+    if score_before_experts:
+        routed_input = (
+            routed_input.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1)
+        ).to(x.dtype)
+
+    shape = routed_input.shape
+    (
+        input_shape,
+        routed_input,
+        permuted_indices,
+        num_tokens_per_expert_group,
+        input_splits,
+        output_splits,
+    ) = _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name)
+
+    routed_output = _run_experts_grouped_mm(
+        experts_w1,
+        experts_w2,
+        experts_w3,
+        routed_input,
+        num_tokens_per_expert_group,
+    )
+    routed_output = _token_combine(
+        routed_output,
+        input_shape,
+        permuted_indices,
+        input_splits,
+        output_splits,
+        axis_name,
+    )
+
+    torch._check(routed_output.shape[0] == shape[0])
+    if not score_before_experts:
+        routed_output = (
+            routed_output.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1)
+        ).to(routed_output.dtype)
+
+    out = out.scatter_add(
+        dim=0,
+        index=token_indices_experts_sorted.reshape(-1, 1).expand(-1, dim),
+        src=routed_output,
+    )
+    return out, num_tokens_per_expert
+
+class MoE(nn.Module):
+    def __init__(
+        self,
+        model_args: Qwen3ModelArgs,
+        mesh: DeviceMesh | None = None,
+        axis_name: str | None = None,
+    ):
+        super().__init__()
+        self.mesh = mesh
+        self.axis_name = axis_name or model_args.moe_axis_name
+        self.num_experts = model_args.num_experts
+        self.top_k = model_args.top_k
+        self.route_norm = model_args.route_norm
+        self.route_scale = model_args.route_scale
+        self.score_before_experts = model_args.score_before_experts
+        self.load_balance_coeff = model_args.load_balance_coeff
+
+        self.router = nn.Linear(model_args.dim, model_args.num_experts, bias=False)
+        self.experts = GroupedExperts(
+            dim=model_args.dim,
+            hidden_dim=model_args.moe_hidden_dim,
+            num_experts=model_args.num_experts,
+            use_grouped_mm=model_args.use_grouped_mm,
+        )
+        self.register_buffer(
+            "expert_bias",
+            torch.zeros(model_args.num_experts, dtype=torch.float32),
+            persistent=self.load_balance_coeff is not None,
+        )
+        self.register_buffer(
+            "tokens_per_expert",
+            torch.zeros(model_args.num_experts, dtype=torch.float32),
+            persistent=False,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        bs, slen, dim = x.shape
+        x = x.view(-1, dim)
+        experts_w1, experts_w2, experts_w3 = self.experts.parameters()
+        experts_w1 = _to_activation_device(experts_w1, x)
+        experts_w2 = _to_activation_device(experts_w2, x)
+        experts_w3 = _to_activation_device(experts_w3, x)
+
+        scores = F.linear(
+            x.to(torch.float32),
+            _to_activation_device(self.router.weight, x).to(torch.float32),
+            None,
+        )
+        scores = F.softmax(scores, dim=-1)
+        expert_bias = _to_activation_device(self.expert_bias, scores)
+        scores_for_choice = (
+            scores + expert_bias
+            if self.load_balance_coeff is not None
+            else scores
+        )
+        _, selected_experts_indices = torch.topk(
+            scores_for_choice,
+            k=self.top_k,
+            dim=-1,
+            sorted=False,
+        )
+
+        top_scores = scores.gather(dim=-1, index=selected_experts_indices)
+        if self.route_norm:
+            denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
+            top_scores = top_scores / denominator
+        top_scores = top_scores * self.route_scale
+
+        # Qwen3 MoE has no shared expert path, but keeping the initial output
+        # differentiably tied to x matches the DSv3 local_map autograd shape.
+        out = x * 0
+        out, num_tokens_per_expert = local_map(
+            qwen3_moe_local_mapped_region,
+            out_placements=(
+                (Shard(0), Shard(0)),
+                (Partial(reduce_op="sum"), Partial(reduce_op="sum")),
+            ),
+            in_placements=(
+                (Shard(0), Shard(0)),
+                (Shard(0), Shard(0)),
+                (Shard(0), Shard(0)),
+                (Replicate(), Shard(0)),
+                (Replicate(), Shard(0)),
+                (Replicate(), Shard(0)),
+                (Shard(0), Shard(0)),
+                None,
+                None,
+                None,
+                None,
+            ),
+            redistribute_inputs=True,
+            in_grad_placements=None,
+            device_mesh=self.mesh,
+        )(
+            x,
+            selected_experts_indices,
+            top_scores,
+            experts_w1,
+            experts_w3,
+            experts_w2,
+            out,
+            self.top_k,
+            self.num_experts,
+            self.score_before_experts,
+            self.axis_name,
+        )
+        # This counter is only used for runtime load-balance diagnostics. During
+        # AutoParallel graph capture the module buffers are fake/meta tensors
+        # while the traced local_map output can be CUDA-fake, and recording this
+        # mutation is not needed for the solved training graph.
+        if not torch.compiler.is_compiling():
+            with torch.no_grad():
+                self.tokens_per_expert.add_(num_tokens_per_expert)  # type: ignore[operator]
+        return out.reshape(bs, slen, dim)
+
+    def init_weights(
+        self,
+        init_std: float,
+        buffer_device: torch.device,
+    ):
+        nn.init.trunc_normal_(self.router.weight, mean=0.0, std=init_std)
+        self.experts.init_weights(init_std)
+        with torch.device(buffer_device):
+            self.tokens_per_expert.zero_()  # type: ignore[operator]
+            self.expert_bias.zero_()  # type: ignore[operator]
+
+
+class TransformerBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        model_args: Qwen3ModelArgs,
+        mesh: DeviceMesh | None = None,
+        moe_axis_name: str | None = None,
+    ):
+        super().__init__()
+        self.attention = Attention(model_args)
+        self.moe_enabled = model_args.moe_enabled
+        if self.moe_enabled:
+            self.moe = MoE(model_args, mesh=mesh, axis_name=moe_axis_name)
+        else:
+            self.feed_forward = FeedForward(
+                dim=model_args.dim,
+                hidden_dim=model_args.hidden_dim,
+            )
+        self.attention_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
+        self.ffn_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
+
+        if model_args.depth_init:
+            self.weight_init_std = 0.02 / math.sqrt(2 * (layer_id + 1))
+        else:
+            self.weight_init_std = 0.02 / math.sqrt(2 * model_args.n_layers)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cos_sin: torch.Tensor,
+    ):
+        h = x + self.attention(_rms_norm(x, self.attention_norm), freqs_cos_sin)
+        if self.moe_enabled:
+            out = h + self.moe(_rms_norm(h, self.ffn_norm))
+        else:
+            out = h + self.feed_forward(_rms_norm(h, self.ffn_norm))
+        return out
+
+    def init_weights(self, buffer_device: torch.device):
+        for norm in (self.attention_norm, self.ffn_norm):
+            norm.reset_parameters()
+        self.attention.init_weights(self.weight_init_std)
+        if self.moe_enabled:
+            self.moe.init_weights(self.weight_init_std, buffer_device)
+        else:
+            self.feed_forward.init_weights(self.weight_init_std)
+
+
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        model_args: Qwen3ModelArgs,
+        mesh: DeviceMesh | None = None,
+        moe_axis_name: str | None = None,
+    ):
+        super().__init__()
+        self.model_args = model_args
+        self.vocab_size = model_args.vocab_size
+        self.n_layers = model_args.n_layers
+        self.eos_id = model_args.eos_id
+        self.enable_weight_tying = model_args.enable_weight_tying
+        self.mesh = mesh
+        self.moe_axis_name = moe_axis_name or model_args.moe_axis_name
+
+        self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
+        self.register_buffer(
+            "freqs_cos_sin",
+            self._precompute_freqs_cos_sin(),
+            persistent=True,
+        )
+
+        self.layers = torch.nn.ModuleDict()
+        for layer_id in range(model_args.n_layers):
+            self.layers[str(layer_id)] = TransformerBlock(
+                layer_id,
+                model_args,
+                mesh=mesh,
+                moe_axis_name=self.moe_axis_name,
+            )
+        self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
+        self.lm_head = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
+
+        if self.enable_weight_tying:
+            self.tok_embeddings.weight = self.lm_head.weight
+
+    def init_weights(
+        self,
+        buffer_device: Optional[torch.device] = None,
+        seed: int | None = None,
+    ):
+        if seed is not None:
+            torch.manual_seed(seed)
+
+        if self.enable_weight_tying:
+            self.tok_embeddings.weight = self.lm_head.weight
+
+        buffer_device = buffer_device or self.freqs_cos_sin.device  # type: ignore[assignment]
+        with torch.device(buffer_device):  # type: ignore[arg-type]
+            self.freqs_cos_sin = self._precompute_freqs_cos_sin()
+
+        if not self.enable_weight_tying and self.tok_embeddings is not None:
+            nn.init.normal_(self.tok_embeddings.weight)
+        for layer in self.layers.values():
+            if layer is not None:
+                layer.init_weights(buffer_device)  # type: ignore[operator]
+        if self.norm is not None:
+            self.norm.reset_parameters()
+
+        final_out_std = self.model_args.dim**-0.5
+        cutoff_factor = 3
+        if self.lm_head is not None:
+            nn.init.trunc_normal_(
+                self.lm_head.weight,
+                mean=0.0,
+                std=final_out_std,
+                a=-cutoff_factor * final_out_std,
+                b=cutoff_factor * final_out_std,
+            )
+
+        if self.enable_weight_tying:
+            self.tok_embeddings.weight = self.lm_head.weight
+
+    def _precompute_freqs_cos_sin(self) -> torch.Tensor:
+        return precompute_freqs_cos_sin(
+            self.model_args.head_dim,
+            self.model_args.max_seq_len,
+            self.model_args.rope_theta,
+        )
+
+    def _token_embedding(self, tokens: torch.Tensor) -> torch.Tensor:
+        weight = self.tok_embeddings.weight
+        if weight.device != tokens.device and weight.device.type == "meta":
+            weight = weight.to(tokens.device)
+        return F.embedding(tokens, weight)
+
+    def forward(self, tokens: torch.Tensor, input_batch: Optional[torch.Tensor] = None):
+        h = self._token_embedding(tokens) if self.tok_embeddings is not None else tokens
+
+        for layer in self.layers.values():
+            h = layer(h, self.freqs_cos_sin)
+
+        h = _rms_norm(h, self.norm) if self.norm is not None else h
+        output = _linear(h, self.lm_head) if self.lm_head is not None else h
+        return output
+
+
+_MODULE_FQN = "module_fqn"
+
+
+def _annotate_once(fn: Callable, meta: dict):
+    if getattr(fn, "_graph_trainer_annotated", False):
+        return fn
+    wrapped = fx_traceback.annotate_fn(meta)(fn)
+    setattr(wrapped, "_graph_trainer_annotated", True)
+    return wrapped
+
+
+def _annotate_module_fqns(model: nn.Module) -> None:
+    for fqn, submodule in model.named_modules():
+        if fqn:
+            submodule.forward = _annotate_once(
+                submodule.forward,
+                {_MODULE_FQN: fqn},
+            )
+
+
+def annotate_qwen3_for_graph_trainer(model: Transformer) -> None:
+    """Attach graph_trainer-compatible FX annotations to AP's Qwen3 model."""
+    global qwen3_moe_local_mapped_region
+
+    qwen3_moe_local_mapped_region = _annotate_once(
+        qwen3_moe_local_mapped_region,
+        {"EP": "compute"},
+    )
+    MoE.forward = _annotate_once(  # type: ignore[method-assign]
+        MoE.forward,
+        {"EP": "compute"},
+    )
+    _annotate_module_fqns(model)
diff --git a/examples/example_qwen3.py b/examples/example_qwen3.py
new file mode 100644
index 00000000..2ae57b00
--- /dev/null
+++ b/examples/example_qwen3.py
@@ -0,0 +1,242 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import time
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.qwen3 import (
+    Qwen3ModelArgs,
+    Transformer,
+    qwen3_235b_a22b_args,
+    qwen3_30b_a3b_args,
+    qwen3_8b_args,
+    qwen3_debug_args,
+    qwen3_moe_debug_args,
+)
+from autoparallel.api import AutoParallel
+from autoparallel.compile import autoparallel_backend
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Trace, optimize, and smoke-test dense Qwen3 with AutoParallel."
+    )
+    parser.add_argument(
+        "--flavor",
+        choices=("tiny", "moe-tiny", "debug", "8b", "moe-debug", "30b-a3b", "235b-a22b"),
+        default="tiny",
+        help="Qwen3 model size to instantiate. Defaults to tiny for faster runs.",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=None,
+        help="Sequence length. Defaults to 8 for tiny, 512 for debug, and 4096 for 8b.",
+    )
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=64,
+        help="Fake process-group world size.",
+    )
+    parser.add_argument(
+        "--tp-degree",
+        type=int,
+        default=8,
+        help="Second mesh degree. Used as TP for dense flavors and EP for MoE flavors.",
+    )
+    parser.add_argument(
+        "--local-batch-size",
+        type=int,
+        default=2,
+        help="Per-DP-rank batch size used for the runtime smoke pass.",
+    )
+    parser.add_argument(
+        "--save-optimizer",
+        type=str,
+        default=None,
+        help="Optional path for the serialized sharding optimizer state.",
+    )
+    parser.add_argument(
+        "--compile",
+        action="store_true",
+        help="Compile the placed module with the AutoParallel backend before running.",
+    )
+    parser.add_argument(
+        "--skip-run",
+        action="store_true",
+        help="Only run tracing, optimization, and placement application.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print the full AutoParallel optimizer log.",
+    )
+    return parser.parse_args()
+
+
+def make_model_args(flavor: str, seq_len: int):
+    if flavor == "tiny":
+        return Qwen3ModelArgs(
+            dim=64,
+            n_layers=2,
+            n_heads=4,
+            n_kv_heads=2,
+            head_dim=16,
+            hidden_dim=128,
+            vocab_size=128,
+            max_seq_len=seq_len,
+        )
+    if flavor == "moe-tiny":
+        return Qwen3ModelArgs(
+            dim=64,
+            n_layers=1,
+            n_heads=4,
+            n_kv_heads=2,
+            head_dim=16,
+            hidden_dim=128,
+            vocab_size=128,
+            max_seq_len=seq_len,
+            moe_enabled=True,
+            moe_hidden_dim=32,
+            num_experts=8,
+            top_k=2,
+            route_norm=True,
+            score_before_experts=False,
+        )
+    if flavor == "debug":
+        return qwen3_debug_args(max_seq_len=seq_len)
+    if flavor == "8b":
+        return qwen3_8b_args(max_seq_len=seq_len)
+    if flavor == "moe-debug":
+        return qwen3_moe_debug_args(max_seq_len=seq_len)
+    if flavor == "30b-a3b":
+        return qwen3_30b_a3b_args(max_seq_len=seq_len)
+    if flavor == "235b-a22b":
+        return qwen3_235b_a22b_args(max_seq_len=seq_len)
+    raise ValueError(f"Unknown Qwen3 flavor: {flavor}")
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    seq_len = args.seq_len
+    if seq_len is None:
+        seq_len = {
+            "tiny": 8,
+            "moe-tiny": 8,
+            "debug": 512,
+            "8b": 4096,
+            "moe-debug": 512,
+            "30b-a3b": 4096,
+            "235b-a22b": 4096,
+        }[args.flavor]
+    if args.world_size % args.tp_degree != 0:
+        raise ValueError(
+            f"world-size ({args.world_size}) must be divisible by "
+            f"tp-degree ({args.tp_degree})."
+        )
+
+    if not torch.distributed.is_initialized():
+        fake_store = FakeStore()
+        torch.distributed.init_process_group(
+            "fake",
+            store=fake_store,
+            rank=0,
+            world_size=args.world_size,
+        )
+
+    model_args = make_model_args(args.flavor, seq_len)
+    mesh_dim_names = ("dp", "ep") if model_args.moe_enabled else ("dp", "tp")
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda",
+        (args.world_size // args.tp_degree, args.tp_degree),
+        mesh_dim_names=mesh_dim_names,
+    )
+    device = torch.device("cuda")
+
+    global_batch_size = args.local_batch_size * mesh.shape[0]
+    if model_args.moe_enabled:
+        global_batch_size *= mesh.shape[1]
+
+    with torch.device("meta"):
+        model = Transformer(
+            model_args,
+            mesh=mesh if model_args.moe_enabled else None,
+            moe_axis_name=mesh.mesh_dim_names[1],
+        )
+
+    def input_fn():
+        return torch.randint(
+            0,
+            model_args.vocab_size,
+            (global_batch_size, seq_len),
+            device=device,
+        )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+
+    t0 = time.time()
+    with AutoParallel(
+        model,
+        input_fn,
+        mesh,
+        mp_policy,
+        dynamic=model_args.moe_enabled,
+        repeated_subgraphs=True,
+    ) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+
+        x_sharding = (Shard(0), Shard(0)) if model_args.moe_enabled else (Shard(0), Replicate())
+        out_sharding = (Shard(0), Shard(2))
+        autop.add_input_constraints([x_sharding])
+        autop.add_output_constraints([out_sharding])
+
+        sharding_placement = autop.optimize_placement(verbose=args.verbose)
+        print(f"Tracing + optimization took {time.time() - t0:.1f}s")
+
+        if args.save_optimizer is not None:
+            autop.sharding_optimizer.save(args.save_optimizer)
+            autop.sharding_optimizer.save_placements(
+                f"{args.save_optimizer}.placements.json"
+            )
+
+        parallel_mod = autop.apply_placement(sharding_placement)
+
+    if args.skip_run:
+        print("Placement applied successfully.")
+        return
+
+    parallel_mod.to_empty(device=device)
+    parallel_mod.init_weights(buffer_device=device)  # type: ignore[operator]
+
+    if args.compile:
+        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
+
+    tokens = torch.randint(
+        0,
+        model_args.vocab_size,
+        (args.local_batch_size, seq_len),
+        device=device,
+    )
+    out = parallel_mod(tokens)
+    if torch.any(torch.isnan(out)):
+        raise RuntimeError("Found NaNs in Qwen3 forward output.")
+    out.backward(torch.randn_like(out))
+    print("All good!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/example_sanity_check_qwen3.py b/examples/example_sanity_check_qwen3.py
new file mode 100644
index 00000000..b7af6c0d
--- /dev/null
+++ b/examples/example_sanity_check_qwen3.py
@@ -0,0 +1,335 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+import time
+
+import torch
+import torch.distributed as dist
+import torch.distributed.nn.functional as dist_nn_func
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel._testing.models.qwen3 import Transformer, qwen3_8b_args
+from autoparallel.api import AutoParallel
+from autoparallel.compile import autoparallel_backend
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Run a real Qwen3 8B AutoParallel training sanity check."
+    )
+    parser.add_argument(
+        "--global-batch-size",
+        type=int,
+        default=16,
+        help="Global batch size across data-parallel ranks.",
+    )
+    parser.add_argument(
+        "--microbatch-size",
+        type=int,
+        default=1,
+        help="Per-DP-rank microbatch size for gradient accumulation.",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=4096,
+        help="Sequence length. Defaults to Qwen3 8B's max sequence length.",
+    )
+    parser.add_argument(
+        "--dp-degree",
+        type=int,
+        default=2,
+        help="Data-parallel mesh degree.",
+    )
+    parser.add_argument(
+        "--tp-degree",
+        type=int,
+        default=2,
+        help="Tensor-parallel mesh degree.",
+    )
+    parser.add_argument(
+        "--train-steps",
+        type=int,
+        default=20,
+        help="Number of optimizer steps.",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=3e-4,
+        help="AdamW learning rate.",
+    )
+    parser.add_argument(
+        "--max-grad-norm",
+        type=float,
+        default=1.0,
+        help="Gradient clipping max norm.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+        help="Seed for model initialization and synthetic data generation.",
+    )
+    parser.add_argument(
+        "--compile",
+        action="store_true",
+        help="Compile the placed module with the AutoParallel backend before training.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print the full AutoParallel optimizer log.",
+    )
+    return parser.parse_args()
+
+
+def init_distributed(args):
+    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
+        raise RuntimeError(
+            "Run this example with torchrun, e.g. "
+            "torchrun --standalone --nproc-per-node 4 "
+            "examples/example_sanity_check_qwen3.py"
+        )
+
+    world_size = int(os.environ["WORLD_SIZE"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    expected_world_size = args.dp_degree * args.tp_degree
+    if world_size != expected_world_size:
+        raise ValueError(
+            f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree "
+            f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})."
+        )
+    if args.global_batch_size % args.dp_degree != 0:
+        raise ValueError(
+            f"global-batch-size ({args.global_batch_size}) must be divisible by "
+            f"dp-degree ({args.dp_degree})."
+        )
+    local_batch_size = args.global_batch_size // args.dp_degree
+    if local_batch_size % args.microbatch_size != 0:
+        raise ValueError(
+            f"local batch size ({local_batch_size}) must be divisible by "
+            f"microbatch-size ({args.microbatch_size})."
+        )
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    dist.init_process_group("nccl", device_id=device)
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda",
+        (args.dp_degree, args.tp_degree),
+        mesh_dim_names=("dp", "tp"),
+    )
+    return device, mesh
+
+
+def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
+    coordinate = mesh.get_coordinate()
+    if coordinate is None:
+        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
+    dp_rank, _tp_rank = coordinate
+    local_batch_size = args.global_batch_size // args.dp_degree
+
+    generator = torch.Generator(device="cpu")
+    generator.manual_seed(args.seed)
+    tokens = torch.randint(
+        0,
+        vocab_size,
+        (args.global_batch_size, args.seq_len + 1),
+        generator=generator,
+        dtype=torch.long,
+    )
+
+    start = dp_rank * local_batch_size
+    stop = start + local_batch_size
+    return tokens[start:stop].to(device, non_blocking=True)
+
+
+def vocab_parallel_cross_entropy(
+    logits: torch.Tensor,
+    labels: torch.Tensor,
+    *,
+    vocab_size: int,
+    tp_group,
+    tp_rank: int,
+    tp_degree: int,
+    global_token_count: int,
+) -> torch.Tensor:
+    if logits.shape[:2] != labels.shape:
+        raise ValueError(
+            f"logits shape {tuple(logits.shape)} is incompatible with "
+            f"labels shape {tuple(labels.shape)}."
+        )
+
+    local_vocab_size = logits.shape[-1]
+    vocab_start = tp_rank * local_vocab_size
+    vocab_stop = vocab_start + local_vocab_size
+    if tp_rank == tp_degree - 1:
+        vocab_stop = vocab_size
+
+    logits = logits.float()
+    local_max = logits.amax(dim=-1)
+    with torch.no_grad():
+        global_max = local_max.detach().clone()
+        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
+
+    shifted_logits = logits - global_max.unsqueeze(-1)
+    local_exp_sum = shifted_logits.exp().sum(dim=-1)
+    global_exp_sum = dist_nn_func.all_reduce(
+        local_exp_sum,
+        op=dist.ReduceOp.SUM,
+        group=tp_group,
+    )
+
+    target_mask = (labels >= vocab_start) & (labels < vocab_stop)
+    local_target = torch.zeros_like(labels, dtype=torch.long)
+    local_target[target_mask] = labels[target_mask] - vocab_start
+    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
+    local_target_logits = local_target_logits * target_mask.to(logits.dtype)
+    target_logits = dist_nn_func.all_reduce(
+        local_target_logits,
+        op=dist.ReduceOp.SUM,
+        group=tp_group,
+    )
+
+    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
+    return loss_sum / (global_token_count * tp_degree)
+
+
+def print_rank0(message: str) -> None:
+    if dist.get_rank() == 0:
+        print(message, flush=True)
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    device, mesh = init_distributed(args)
+    tp_group = mesh.get_group("tp")
+    tp_rank = mesh.get_local_rank("tp")
+    local_batch_size = args.global_batch_size // args.dp_degree
+    gradient_accumulation_steps = local_batch_size // args.microbatch_size
+
+    torch.manual_seed(args.seed)
+    model_args = qwen3_8b_args(max_seq_len=args.seq_len)
+    trace_global_batch_size = args.microbatch_size * args.dp_degree
+
+    with torch.device("meta"):
+        model = Transformer(model_args)
+
+    def input_fn():
+        return torch.randint(
+            0,
+            model_args.vocab_size,
+            (trace_global_batch_size, args.seq_len),
+            device=device,
+        )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+
+    print_rank0(
+        "Qwen3 8B sanity check: "
+        f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), "
+        f"global_batch={args.global_batch_size}, "
+        f"local_batch={local_batch_size}, "
+        f"microbatch={args.microbatch_size}, "
+        f"grad_accum={gradient_accumulation_steps}, "
+        f"trace_global_batch={trace_global_batch_size}, "
+        f"seq_len={args.seq_len}"
+    )
+
+    t0 = time.time()
+    with AutoParallel(
+        model,
+        input_fn,
+        mesh,
+        mp_policy,
+        repeated_subgraphs=True,
+    ) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        autop.add_input_constraints([(Shard(0), Replicate())])
+        autop.add_output_constraints([(Shard(0), Shard(2))])
+        sharding_placement = autop.optimize_placement(verbose=args.verbose)
+        parallel_mod = autop.apply_placement(sharding_placement)
+
+    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
+
+    parallel_mod.to_empty(device=device)
+    parallel_mod.init_weights(buffer_device=device, seed=args.seed)  # type: ignore[operator]
+
+    if args.compile:
+        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
+
+    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
+    inputs = batch[:, :-1].contiguous()
+    labels = batch[:, 1:].contiguous()
+    input_microbatches = inputs.split(args.microbatch_size, dim=0)
+    label_microbatches = labels.split(args.microbatch_size, dim=0)
+    global_token_count = args.global_batch_size * args.seq_len
+    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
+
+    try:
+        losses: list[float] = []
+        for step in range(args.train_steps):
+            optimizer.zero_grad(set_to_none=True)
+            step_loss = torch.zeros((), device=device)
+            for micro_inputs, micro_labels in zip(
+                input_microbatches, label_microbatches
+            ):
+                logits = parallel_mod(micro_inputs)
+                if torch.any(torch.isnan(logits)):
+                    raise RuntimeError("Found NaNs in Qwen3 forward output.")
+
+                loss = vocab_parallel_cross_entropy(
+                    logits,
+                    micro_labels,
+                    vocab_size=model_args.vocab_size,
+                    tp_group=tp_group,
+                    tp_rank=tp_rank,
+                    tp_degree=args.tp_degree,
+                    global_token_count=global_token_count,
+                )
+                if torch.any(torch.isnan(loss)):
+                    raise RuntimeError("Found NaNs in Qwen3 training loss.")
+
+                loss.backward()
+                step_loss = step_loss + loss.detach()
+
+            torch.nn.utils.clip_grad_norm_(
+                parallel_mod.parameters(), args.max_grad_norm
+            )
+            optimizer.step()
+
+            with torch.no_grad():
+                logged_loss = step_loss.clone()
+                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
+                loss_value = float(logged_loss.item())
+            losses.append(loss_value)
+            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
+
+        if losses[-1] >= losses[0]:
+            raise RuntimeError(
+                f"Qwen3 training loss did not improve: initial={losses[0]:.6f}, "
+                f"final={losses[-1]:.6f}"
+            )
+
+        print_rank0(f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}")
+        dist.barrier(device_ids=[device.index])
+        torch.cuda.synchronize(device)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/example_sanity_check_qwen3_moe.py b/examples/example_sanity_check_qwen3_moe.py
new file mode 100644
index 00000000..dd16afb7
--- /dev/null
+++ b/examples/example_sanity_check_qwen3_moe.py
@@ -0,0 +1,466 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+import time
+
+import torch
+import torch.distributed as dist
+import torch.distributed.nn.functional as dist_nn_func
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Shard
+
+from autoparallel._testing.models.qwen3 import (
+    Qwen3ModelArgs,
+    Transformer,
+    qwen3_235b_a22b_args,
+    qwen3_30b_a3b_args,
+    qwen3_moe_debug_args,
+)
+from autoparallel.api import AutoParallel
+from autoparallel.compile import autoparallel_backend
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Run a real Qwen3 MoE AutoParallel training sanity check."
+    )
+    parser.add_argument(
+        "--flavor",
+        choices=("moe-tiny", "moe-debug", "30b-a3b", "235b-a22b"),
+        default="30b-a3b",
+        help="Qwen3 MoE model size. Defaults to the real Qwen3-30B-A3B model.",
+    )
+    parser.add_argument(
+        "--global-batch-size",
+        type=int,
+        default=4,
+        help="Global batch size across data-parallel ranks.",
+    )
+    parser.add_argument(
+        "--microbatch-size",
+        type=int,
+        default=1,
+        help="Per-rank input microbatch size before EP all-gather inside the model.",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=8192,
+        help="Sequence length. Defaults to 8192 for the 4xH100 sanity run.",
+    )
+    parser.add_argument(
+        "--dp-degree",
+        type=int,
+        default=2,
+        help="Data-parallel mesh degree.",
+    )
+    parser.add_argument(
+        "--ep-degree",
+        type=int,
+        default=2,
+        help="Expert-parallel mesh degree.",
+    )
+    parser.add_argument(
+        "--train-steps",
+        type=int,
+        default=30,
+        help="Number of optimizer steps.",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=3e-4,
+        help="Optimizer learning rate.",
+    )
+    parser.add_argument(
+        "--optimizer",
+        choices=("adamw", "sgd", "none"),
+        default="adamw",
+        help="Optimizer to use after backward. Use sgd/none for large-model memory smoke runs.",
+    )
+    parser.add_argument(
+        "--max-grad-norm",
+        type=float,
+        default=1.0,
+        help="Gradient clipping max norm.",
+    )
+    parser.add_argument(
+        "--loss-chunk-size",
+        type=int,
+        default=512,
+        help=(
+            "Sequence chunk size for vocab-parallel cross entropy. "
+            "Keeps the 8192-token real-model run from materializing full-size "
+            "float logits and exp buffers at once."
+        ),
+    )
+    parser.add_argument(
+        "--skip-loss-improvement-check",
+        action="store_true",
+        help="Only require finite forward/backward/optimizer steps.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+        help="Seed for model initialization and synthetic data generation.",
+    )
+    parser.add_argument(
+        "--compile",
+        action="store_true",
+        help="Compile the placed module with the AutoParallel backend before training.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print the full AutoParallel optimizer log.",
+    )
+    return parser.parse_args()
+
+
+def make_model_args(flavor: str, seq_len: int | None) -> Qwen3ModelArgs:
+    if flavor == "moe-tiny":
+        max_seq_len = 512 if seq_len is None else seq_len
+        return Qwen3ModelArgs(
+            dim=64,
+            n_layers=1,
+            n_heads=4,
+            n_kv_heads=2,
+            head_dim=16,
+            hidden_dim=128,
+            vocab_size=128,
+            max_seq_len=max_seq_len,
+            moe_enabled=True,
+            moe_hidden_dim=32,
+            num_experts=8,
+            top_k=2,
+            route_norm=True,
+            score_before_experts=False,
+            moe_axis_name="ep",
+        )
+    overrides = {"moe_axis_name": "ep"}
+    if seq_len is not None:
+        overrides["max_seq_len"] = seq_len
+    if flavor == "moe-debug":
+        return qwen3_moe_debug_args(**overrides)
+    if flavor == "30b-a3b":
+        return qwen3_30b_a3b_args(**overrides)
+    if flavor == "235b-a22b":
+        return qwen3_235b_a22b_args(**overrides)
+    raise ValueError(f"Unknown Qwen3 MoE flavor: {flavor}")
+
+
+def init_distributed(args):
+    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
+        raise RuntimeError(
+            "Run this example with torchrun, e.g. "
+            "torchrun --standalone --nproc-per-node 4 "
+            "examples/example_sanity_check_qwen3_moe.py"
+        )
+
+    world_size = int(os.environ["WORLD_SIZE"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    expected_world_size = args.dp_degree * args.ep_degree
+    if world_size != expected_world_size:
+        raise ValueError(
+            f"WORLD_SIZE ({world_size}) must equal dp-degree * ep-degree "
+            f"({args.dp_degree} * {args.ep_degree} = {expected_world_size})."
+        )
+    if args.global_batch_size % args.dp_degree != 0:
+        raise ValueError(
+            f"global-batch-size ({args.global_batch_size}) must be divisible by "
+            f"dp-degree ({args.dp_degree})."
+        )
+
+    local_dp_batch_size = args.global_batch_size // args.dp_degree
+    local_dp_microbatch = args.microbatch_size * args.ep_degree
+    if local_dp_batch_size % local_dp_microbatch != 0:
+        raise ValueError(
+            f"local DP batch size ({local_dp_batch_size}) must be divisible by "
+            f"microbatch-size * ep-degree "
+            f"({args.microbatch_size} * {args.ep_degree} = {local_dp_microbatch})."
+        )
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    dist.init_process_group("nccl", device_id=device)
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda",
+        (args.dp_degree, args.ep_degree),
+        mesh_dim_names=("dp", "ep"),
+    )
+    return device, mesh
+
+
+def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
+    coordinate = mesh.get_coordinate()
+    if coordinate is None:
+        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
+    dp_rank, _ep_rank = coordinate
+    local_dp_batch_size = args.global_batch_size // args.dp_degree
+
+    generator = torch.Generator(device="cpu")
+    generator.manual_seed(args.seed)
+    tokens = torch.randint(
+        0,
+        vocab_size,
+        (args.global_batch_size, args.seq_len + 1),
+        generator=generator,
+        dtype=torch.long,
+    )
+
+    start = dp_rank * local_dp_batch_size
+    stop = start + local_dp_batch_size
+    return tokens[start:stop].to(device, non_blocking=True)
+
+
+def vocab_parallel_cross_entropy(
+    logits: torch.Tensor,
+    labels: torch.Tensor,
+    *,
+    vocab_size: int,
+    vocab_group,
+    vocab_rank: int,
+    vocab_degree: int,
+    global_token_count: int,
+) -> torch.Tensor:
+    if logits.shape[:2] != labels.shape:
+        raise ValueError(
+            f"logits shape {tuple(logits.shape)} is incompatible with "
+            f"labels shape {tuple(labels.shape)}."
+        )
+
+    local_vocab_size = logits.shape[-1]
+    vocab_start = vocab_rank * local_vocab_size
+    vocab_stop = vocab_start + local_vocab_size
+    if vocab_rank == vocab_degree - 1:
+        vocab_stop = vocab_size
+
+    logits = logits.float()
+    local_max = logits.amax(dim=-1)
+    with torch.no_grad():
+        global_max = local_max.detach().clone()
+        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=vocab_group)
+
+    shifted_logits = logits - global_max.unsqueeze(-1)
+    local_exp_sum = shifted_logits.exp().sum(dim=-1)
+    global_exp_sum = dist_nn_func.all_reduce(
+        local_exp_sum,
+        op=dist.ReduceOp.SUM,
+        group=vocab_group,
+    )
+
+    target_mask = (labels >= vocab_start) & (labels < vocab_stop)
+    local_target = torch.zeros_like(labels, dtype=torch.long)
+    local_target[target_mask] = labels[target_mask] - vocab_start
+    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
+    local_target_logits = local_target_logits * target_mask.to(logits.dtype)
+    target_logits = dist_nn_func.all_reduce(
+        local_target_logits,
+        op=dist.ReduceOp.SUM,
+        group=vocab_group,
+    )
+
+    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
+    return loss_sum / (global_token_count * vocab_degree)
+
+
+def chunk_ranges(size: int, chunk_size: int):
+    if chunk_size <= 0:
+        yield 0, size
+        return
+    for start in range(0, size, chunk_size):
+        yield start, min(start + chunk_size, size)
+
+
+def print_rank0(message: str) -> None:
+    if dist.get_rank() == 0:
+        print(message, flush=True)
+
+
+def print_cuda_memory(stage: str, device: torch.device) -> None:
+    allocated = torch.cuda.memory_allocated(device) / 1024**3
+    reserved = torch.cuda.memory_reserved(device) / 1024**3
+    max_reserved = torch.cuda.max_memory_reserved(device) / 1024**3
+    print_rank0(
+        f"{stage}: cuda allocated={allocated:.2f}GiB "
+        f"reserved={reserved:.2f}GiB max_reserved={max_reserved:.2f}GiB"
+    )
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    device, mesh = init_distributed(args)
+    ep_group = mesh.get_group("ep")
+    ep_rank = mesh.get_local_rank("ep")
+    local_dp_batch_size = args.global_batch_size // args.dp_degree
+    local_dp_microbatch = args.microbatch_size * args.ep_degree
+    gradient_accumulation_steps = local_dp_batch_size // local_dp_microbatch
+
+    torch.manual_seed(args.seed)
+    model_args = make_model_args(args.flavor, args.seq_len)
+    if args.seq_len is None:
+        args.seq_len = model_args.max_seq_len
+    if model_args.num_experts % args.ep_degree != 0:
+        raise ValueError(
+            f"num_experts ({model_args.num_experts}) must be divisible by "
+            f"ep-degree ({args.ep_degree})."
+        )
+    trace_global_batch_size = args.microbatch_size * args.dp_degree * args.ep_degree
+
+    with torch.device("meta"):
+        model = Transformer(model_args, mesh=mesh, moe_axis_name="ep")
+
+    def input_fn():
+        return torch.randint(
+            0,
+            model_args.vocab_size,
+            (trace_global_batch_size, args.seq_len),
+            device=device,
+        )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+
+    print_rank0(
+        f"Qwen3 {args.flavor} sanity check: "
+        f"mesh=(dp={args.dp_degree}, ep={args.ep_degree}), "
+        f"global_batch={args.global_batch_size}, "
+        f"local_dp_batch={local_dp_batch_size}, "
+        f"per_rank_microbatch={args.microbatch_size}, "
+        f"local_dp_microbatch={local_dp_microbatch}, "
+        f"grad_accum={gradient_accumulation_steps}, "
+        f"trace_global_batch={trace_global_batch_size}, "
+        f"seq_len={args.seq_len}, "
+        f"loss_chunk_size={args.loss_chunk_size}, "
+        f"optimizer={args.optimizer}"
+    )
+
+    t0 = time.time()
+    with AutoParallel(
+        model,
+        input_fn,
+        mesh,
+        mp_policy,
+        dynamic=True,
+        repeated_subgraphs=True,
+    ) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        autop.add_input_constraints([(Shard(0), Shard(0))])
+        autop.add_output_constraints([(Shard(0), Shard(2))])
+        sharding_placement = autop.optimize_placement(verbose=args.verbose)
+        parallel_mod = autop.apply_placement(sharding_placement)
+
+    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
+    print_cuda_memory("after AutoParallel", device)
+
+    parallel_mod.to_empty(device=device)
+    print_cuda_memory("after to_empty", device)
+    parallel_mod.init_weights(buffer_device=device, seed=args.seed)  # type: ignore[operator]
+    print_cuda_memory("after init_weights", device)
+
+    if args.compile:
+        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
+
+    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
+    inputs = batch[:, :-1].contiguous()
+    labels = batch[:, 1:].contiguous()
+
+    ep_coordinate = mesh.get_coordinate()[1]
+    input_microbatches = []
+    label_microbatches = []
+    for start in range(0, local_dp_batch_size, local_dp_microbatch):
+        stop = start + local_dp_microbatch
+        input_block = inputs[start:stop]
+        input_start = ep_coordinate * args.microbatch_size
+        input_stop = input_start + args.microbatch_size
+        input_microbatches.append(input_block[input_start:input_stop].contiguous())
+        label_microbatches.append(labels[start:stop].contiguous())
+
+    global_token_count = args.global_batch_size * args.seq_len
+    if args.optimizer == "adamw":
+        optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
+    elif args.optimizer == "sgd":
+        optimizer = torch.optim.SGD(parallel_mod.parameters(), lr=args.lr)
+    else:
+        optimizer = None
+
+    try:
+        losses: list[float] = []
+        for step in range(args.train_steps):
+            if optimizer is not None:
+                optimizer.zero_grad(set_to_none=True)
+            else:
+                parallel_mod.zero_grad(set_to_none=True)
+            step_loss = torch.zeros((), device=device)
+            for micro_inputs, micro_labels in zip(
+                input_microbatches, label_microbatches
+            ):
+                logits = parallel_mod(micro_inputs)
+
+                seq_ranges = list(chunk_ranges(logits.shape[1], args.loss_chunk_size))
+                for chunk_idx, (seq_start, seq_stop) in enumerate(seq_ranges):
+                    logits_chunk = logits[:, seq_start:seq_stop]
+                    labels_chunk = micro_labels[:, seq_start:seq_stop]
+                    loss = vocab_parallel_cross_entropy(
+                        logits_chunk,
+                        labels_chunk,
+                        vocab_size=model_args.vocab_size,
+                        vocab_group=ep_group,
+                        vocab_rank=ep_rank,
+                        vocab_degree=args.ep_degree,
+                        global_token_count=global_token_count,
+                    )
+                    if torch.any(torch.isnan(loss)):
+                        raise RuntimeError("Found NaNs in Qwen3 MoE training loss.")
+
+                    retain_graph = chunk_idx != len(seq_ranges) - 1
+                    loss.backward(retain_graph=retain_graph)
+                    step_loss = step_loss + loss.detach()
+
+            torch.nn.utils.clip_grad_norm_(
+                parallel_mod.parameters(), args.max_grad_norm
+            )
+            if optimizer is not None:
+                optimizer.step()
+
+            with torch.no_grad():
+                logged_loss = step_loss.clone()
+                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
+                loss_value = float(logged_loss.item())
+            losses.append(loss_value)
+            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
+            print_cuda_memory(f"after step {step:03d}", device)
+
+        if (
+            not args.skip_loss_improvement_check
+            and len(losses) > 1
+            and losses[-1] >= losses[0]
+        ):
+            raise RuntimeError(
+                f"Qwen3 MoE training loss did not improve: "
+                f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
+            )
+
+        if len(losses) > 1:
+            print_rank0(
+                f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
+            )
+        dist.barrier(device_ids=[device.index])
+        torch.cuda.synchronize(device)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/example_torchtitan_qwen3_dense.py b/examples/example_torchtitan_qwen3_dense.py
new file mode 100644
index 00000000..a4685d1b
--- /dev/null
+++ b/examples/example_torchtitan_qwen3_dense.py
@@ -0,0 +1,370 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import dataclasses
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import torch.distributed.nn.functional as dist_nn_func
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel.api import AutoParallel
+from autoparallel.compile import autoparallel_backend
+
+
+def _add_sibling_torchtitan_to_path() -> None:
+    repo_root = Path(__file__).resolve().parents[1]
+    torchtitan_root = repo_root.parent / "torchtitan"
+    if torchtitan_root.exists():
+        sys.path.insert(0, str(torchtitan_root))
+
+
+_add_sibling_torchtitan_to_path()
+
+from torchtitan.models.qwen3 import Qwen3Model, qwen3_configs  # noqa: E402
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description=(
+            "Run torchtitan's dense Qwen3 model through AutoParallel's "
+            "searched placement on real GPUs."
+        )
+    )
+    parser.add_argument(
+        "--flavor",
+        choices=("debugmodel", "debugmodel_fused_qkv", "0.6B", "1.7B", "4B", "8B"),
+        default="8B",
+        help="Dense torchtitan Qwen3 flavor.",
+    )
+    parser.add_argument(
+        "--global-batch-size",
+        type=int,
+        default=4,
+        help="Global batch size across data-parallel ranks.",
+    )
+    parser.add_argument(
+        "--microbatch-size",
+        type=int,
+        default=1,
+        help="Per-DP-rank microbatch size for gradient accumulation.",
+    )
+    parser.add_argument(
+        "--seq-len",
+        type=int,
+        default=2048,
+        help="Sequence length for the real sanity run.",
+    )
+    parser.add_argument(
+        "--dp-degree",
+        type=int,
+        default=2,
+        help="Data-parallel mesh degree.",
+    )
+    parser.add_argument(
+        "--tp-degree",
+        type=int,
+        default=2,
+        help="Tensor-parallel mesh degree.",
+    )
+    parser.add_argument(
+        "--train-steps",
+        type=int,
+        default=2,
+        help="Number of optimizer steps.",
+    )
+    parser.add_argument(
+        "--lr",
+        type=float,
+        default=3e-4,
+        help="AdamW learning rate.",
+    )
+    parser.add_argument(
+        "--max-grad-norm",
+        type=float,
+        default=1.0,
+        help="Gradient clipping max norm.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=0,
+        help="Seed for model initialization and synthetic data generation.",
+    )
+    parser.add_argument(
+        "--compile",
+        action="store_true",
+        help="Compile the placed module with the AutoParallel backend before training.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Print the full AutoParallel optimizer log.",
+    )
+    return parser.parse_args()
+
+
+def make_model_config(flavor: str, seq_len: int) -> Qwen3Model.Config:
+    config = qwen3_configs[flavor](attn_backend="sdpa")
+    config.rope = dataclasses.replace(config.rope, max_seq_len=seq_len)
+    return config
+
+
+def init_distributed(args):
+    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
+        raise RuntimeError(
+            "Run this example with torchrun, e.g. "
+            "torchrun --standalone --nproc-per-node 4 "
+            "examples/example_torchtitan_qwen3_dense.py"
+        )
+
+    world_size = int(os.environ["WORLD_SIZE"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    expected_world_size = args.dp_degree * args.tp_degree
+    if world_size != expected_world_size:
+        raise ValueError(
+            f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree "
+            f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})."
+        )
+    if args.global_batch_size % args.dp_degree != 0:
+        raise ValueError(
+            f"global-batch-size ({args.global_batch_size}) must be divisible by "
+            f"dp-degree ({args.dp_degree})."
+        )
+    local_batch_size = args.global_batch_size // args.dp_degree
+    if local_batch_size % args.microbatch_size != 0:
+        raise ValueError(
+            f"local batch size ({local_batch_size}) must be divisible by "
+            f"microbatch-size ({args.microbatch_size})."
+        )
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    dist.init_process_group("nccl", device_id=device)
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda",
+        (args.dp_degree, args.tp_degree),
+        mesh_dim_names=("dp", "tp"),
+    )
+    return device, mesh
+
+
+def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
+    coordinate = mesh.get_coordinate()
+    if coordinate is None:
+        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
+    dp_rank, _tp_rank = coordinate
+    local_batch_size = args.global_batch_size // args.dp_degree
+
+    generator = torch.Generator(device="cpu")
+    generator.manual_seed(args.seed)
+    tokens = torch.randint(
+        0,
+        vocab_size,
+        (args.global_batch_size, args.seq_len + 1),
+        generator=generator,
+        dtype=torch.long,
+    )
+
+    start = dp_rank * local_batch_size
+    stop = start + local_batch_size
+    return tokens[start:stop].to(device, non_blocking=True)
+
+
+def vocab_parallel_cross_entropy(
+    logits: torch.Tensor,
+    labels: torch.Tensor,
+    *,
+    vocab_size: int,
+    tp_group,
+    tp_rank: int,
+    tp_degree: int,
+    global_token_count: int,
+) -> torch.Tensor:
+    if logits.shape[:2] != labels.shape:
+        raise ValueError(
+            f"logits shape {tuple(logits.shape)} is incompatible with "
+            f"labels shape {tuple(labels.shape)}."
+        )
+
+    local_vocab_size = logits.shape[-1]
+    vocab_start = tp_rank * local_vocab_size
+    vocab_stop = vocab_start + local_vocab_size
+    if tp_rank == tp_degree - 1:
+        vocab_stop = vocab_size
+
+    logits = logits.float()
+    local_max = logits.amax(dim=-1)
+    with torch.no_grad():
+        global_max = local_max.detach().clone()
+        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
+
+    shifted_logits = logits - global_max.unsqueeze(-1)
+    local_exp_sum = shifted_logits.exp().sum(dim=-1)
+    global_exp_sum = dist_nn_func.all_reduce(
+        local_exp_sum,
+        op=dist.ReduceOp.SUM,
+        group=tp_group,
+    )
+
+    target_mask = (labels >= vocab_start) & (labels < vocab_stop)
+    local_target = torch.zeros_like(labels, dtype=torch.long)
+    local_target[target_mask] = labels[target_mask] - vocab_start
+    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
+    local_target_logits = local_target_logits * target_mask.to(logits.dtype)
+    target_logits = dist_nn_func.all_reduce(
+        local_target_logits,
+        op=dist.ReduceOp.SUM,
+        group=tp_group,
+    )
+
+    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
+    return loss_sum / (global_token_count * tp_degree)
+
+
+def print_rank0(message: str) -> None:
+    if dist.get_rank() == 0:
+        print(message, flush=True)
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.DEBUG)
+
+    device, mesh = init_distributed(args)
+    tp_group = mesh.get_group("tp")
+    tp_rank = mesh.get_local_rank("tp")
+    local_batch_size = args.global_batch_size // args.dp_degree
+    gradient_accumulation_steps = local_batch_size // args.microbatch_size
+
+    torch.manual_seed(args.seed)
+    model_config = make_model_config(args.flavor, args.seq_len)
+    vocab_size = model_config.vocab_size
+
+    with torch.device("meta"):
+        model = model_config.build()
+
+    def input_fn():
+        return torch.randint(
+            0,
+            vocab_size,
+            (args.global_batch_size, args.seq_len),
+            device=device,
+        )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+
+    print_rank0(
+        f"torchtitan Qwen3 {args.flavor} via AutoParallel: "
+        f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), "
+        f"global_batch={args.global_batch_size}, "
+        f"local_batch={local_batch_size}, "
+        f"microbatch={args.microbatch_size}, "
+        f"grad_accum={gradient_accumulation_steps}, "
+        f"seq_len={args.seq_len}"
+    )
+
+    t0 = time.time()
+    with AutoParallel(
+        model,
+        input_fn,
+        mesh,
+        mp_policy,
+        repeated_subgraphs=True,
+    ) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        autop.add_input_constraints([(Shard(0), Replicate())])
+        autop.add_output_constraints([(Shard(0), Shard(2))])
+        sharding_placement = autop.optimize_placement(verbose=args.verbose)
+        parallel_mod = autop.apply_placement(sharding_placement)
+
+    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
+
+    parallel_mod.to_empty(device=device)
+    torch.manual_seed(args.seed)
+    parallel_mod.init_weights(buffer_device=device)  # type: ignore[operator]
+
+    if args.compile:
+        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
+
+    batch = make_local_tokens(args, mesh, device, vocab_size)
+    inputs = batch[:, :-1].contiguous()
+    labels = batch[:, 1:].contiguous()
+    input_microbatches = torch.split(inputs, args.microbatch_size, dim=0)
+    label_microbatches = torch.split(labels, args.microbatch_size, dim=0)
+
+    global_token_count = args.global_batch_size * args.seq_len
+    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
+
+    try:
+        losses: list[float] = []
+        for step in range(args.train_steps):
+            optimizer.zero_grad(set_to_none=True)
+            step_loss = torch.zeros((), device=device)
+            for micro_inputs, micro_labels in zip(
+                input_microbatches, label_microbatches
+            ):
+                logits = parallel_mod(micro_inputs)
+                if torch.any(torch.isnan(logits)):
+                    raise RuntimeError("Found NaNs in forward output.")
+
+                loss = vocab_parallel_cross_entropy(
+                    logits,
+                    micro_labels,
+                    vocab_size=vocab_size,
+                    tp_group=tp_group,
+                    tp_rank=tp_rank,
+                    tp_degree=args.tp_degree,
+                    global_token_count=global_token_count,
+                )
+                if torch.any(torch.isnan(loss)):
+                    raise RuntimeError("Found NaNs in training loss.")
+
+                loss.backward()
+                step_loss = step_loss + loss.detach()
+
+            torch.nn.utils.clip_grad_norm_(
+                parallel_mod.parameters(), args.max_grad_norm
+            )
+            optimizer.step()
+
+            with torch.no_grad():
+                logged_loss = step_loss.clone()
+                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
+                loss_value = float(logged_loss.item())
+            losses.append(loss_value)
+            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
+
+        if len(losses) > 1 and losses[-1] >= losses[0]:
+            raise RuntimeError(
+                f"Training loss did not improve: "
+                f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
+            )
+
+        if len(losses) > 1:
+            print_rank0(
+                f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
+            )
+        else:
+            print_rank0(f"Completed one step: loss={losses[0]:.6f}")
+        dist.barrier(device_ids=[device.index])
+        torch.cuda.synchronize(device)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_dsv3_torchtitan_config.py b/tests/test_dsv3_torchtitan_config.py
new file mode 100644
index 00000000..e009206b
--- /dev/null
+++ b/tests/test_dsv3_torchtitan_config.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+from pathlib import Path
+
+import pytest
+import torch
+
+from autoparallel._testing.models.dsv3 import DeepSeekV3Model
+
+
+def test_dsv3_accepts_torchtitan_grouped_experts_config():
+    torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan"
+    if not torchtitan_root.exists():
+        pytest.skip("torchtitan sibling checkout not found")
+    sys.path.insert(0, str(torchtitan_root))
+
+    try:
+        from torchtitan.models.deepseek_v3 import deepseekv3_configs  # type: ignore[import-not-found]
+    except Exception as exc:
+        pytest.skip(f"torchtitan DeepSeek-V3 config unavailable: {exc}")
+
+    with torch.device("meta"):
+        model = DeepSeekV3Model(
+            deepseekv3_configs["debugmodel"](
+                attn_backend="sdpa",
+                moe_comm_backend="standard",
+            )
+        )
+
+    moe_layer = next(layer for layer in model.layers.values() if layer.moe_enabled)
+    assert moe_layer.moe.experts.use_grouped_mm
diff --git a/tests/test_qwen3.py b/tests/test_qwen3.py
new file mode 100644
index 00000000..5b32bc5b
--- /dev/null
+++ b/tests/test_qwen3.py
@@ -0,0 +1,323 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import sys
+from pathlib import Path
+
+import pytest
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor import DTensor
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel._testing.models.qwen3 import (
+    Qwen3ModelArgs,
+    Transformer,
+    apply_rotary_emb_cos_sin,
+    qwen3_debug_args,
+    qwen3_args_from_torchtitan_config,
+    qwen3_moe_debug_args,
+)
+from autoparallel.api import AutoParallel, auto_parallel
+
+
+def _tiny_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=64,
+        n_layers=2,
+        n_heads=4,
+        n_kv_heads=2,
+        head_dim=16,
+        hidden_dim=128,
+        vocab_size=128,
+        max_seq_len=16,
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def _tiny_moe_args(**overrides) -> Qwen3ModelArgs:
+    args = Qwen3ModelArgs(
+        dim=32,
+        n_layers=1,
+        n_heads=4,
+        n_kv_heads=2,
+        head_dim=8,
+        hidden_dim=64,
+        vocab_size=64,
+        max_seq_len=4,
+        moe_enabled=True,
+        moe_hidden_dim=16,
+        num_experts=64,
+        top_k=8,
+        route_norm=True,
+        score_before_experts=False,
+        moe_axis_name="tp",
+    )
+    for key, value in overrides.items():
+        setattr(args, key, value)
+    args.__post_init__()
+    return args
+
+
+def test_qwen3_forward_shape():
+    args = _tiny_args()
+    model = Transformer(args)
+    model.init_weights(seed=0)
+
+    tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len))
+    logits = model(tokens)
+
+    assert logits.shape == (2, args.max_seq_len, args.vocab_size)
+
+
+def test_qwen3_qk_norm_changes_logits():
+    args = _tiny_args(n_layers=1)
+    model = Transformer(args)
+    model.init_weights(seed=0)
+
+    tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len))
+    logits = model(tokens)
+
+    with torch.no_grad():
+        model.layers["0"].attention.q_norm.weight.zero_()
+    logits_without_q = model(tokens)
+
+    assert not torch.allclose(logits, logits_without_q)
+
+
+def test_qwen3_weight_tying_survives_init_weights():
+    args = _tiny_args(enable_weight_tying=True)
+    model = Transformer(args)
+
+    assert model.tok_embeddings.weight is model.lm_head.weight
+    model.init_weights(seed=0)
+    assert model.tok_embeddings.weight is model.lm_head.weight
+
+
+def test_qwen3_debug_args_matches_torchtitan_dense_shape():
+    args = qwen3_debug_args(max_seq_len=32)
+
+    assert args.dim == 256
+    assert args.n_layers == 8
+    assert args.n_heads == 16
+    assert args.n_kv_heads == 8
+    assert args.head_dim == 128
+    assert args.hidden_dim == 3072
+    assert args.vocab_size == 2048
+    assert args.rope_theta == 1000000.0
+    assert args.enable_weight_tying
+
+
+def test_qwen3_moe_debug_args_matches_torchtitan_shape():
+    args = qwen3_moe_debug_args(max_seq_len=32)
+
+    assert args.dim == 256
+    assert args.n_layers == 8
+    assert args.n_heads == 16
+    assert args.n_kv_heads == 8
+    assert args.head_dim == 128
+    assert args.moe_enabled
+    assert args.moe_hidden_dim == 768
+    assert args.num_experts == 64
+    assert args.top_k == 8
+    assert args.route_norm
+    assert not args.score_before_experts
+
+
+@pytest.mark.parametrize(
+    ("flavor", "expected"),
+    [
+        (
+            "8B",
+            {
+                "dim": 4096,
+                "n_layers": 36,
+                "n_heads": 32,
+                "n_kv_heads": 8,
+                "head_dim": 128,
+                "hidden_dim": 12288,
+                "vocab_size": 151936,
+                "moe_enabled": False,
+                "num_experts": 0,
+                "top_k": 1,
+                "max_seq_len": 4096,
+            },
+        ),
+        (
+            "30B-A3B",
+            {
+                "dim": 2048,
+                "n_layers": 48,
+                "n_heads": 32,
+                "n_kv_heads": 4,
+                "head_dim": 128,
+                "hidden_dim": 0,
+                "vocab_size": 151936,
+                "moe_enabled": True,
+                "moe_hidden_dim": 768,
+                "num_experts": 128,
+                "top_k": 8,
+                "route_norm": True,
+                "score_before_experts": False,
+                "max_seq_len": 262144,
+            },
+        ),
+    ],
+)
+def test_qwen3_args_from_torchtitan_config(flavor, expected):
+    torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan"
+    if not torchtitan_root.exists():
+        pytest.skip("torchtitan sibling checkout not found")
+    sys.path.insert(0, str(torchtitan_root))
+
+    try:
+        from torchtitan.models.qwen3 import qwen3_configs  # type: ignore[import-not-found]
+    except Exception as exc:
+        pytest.skip(f"torchtitan Qwen3 config unavailable: {exc}")
+
+    args = qwen3_args_from_torchtitan_config(
+        qwen3_configs[flavor](attn_backend="sdpa")
+    )
+
+    for attr, value in expected.items():
+        assert getattr(args, attr) == value
+    assert args.rope_theta == 1000000.0
+    assert args.norm_eps == 1e-6
+
+
+def test_qwen3_cos_sin_rope_matches_torchtitan_helper():
+    torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan"
+    if not torchtitan_root.exists():
+        pytest.skip("torchtitan sibling checkout not found")
+    sys.path.insert(0, str(torchtitan_root))
+
+    try:
+        from torchtitan.models.common.rope import (  # type: ignore[import-not-found]
+            RoPE,
+            apply_rotary_emb_cos_sin as tt_apply_rotary_emb_cos_sin,
+        )
+    except Exception as exc:
+        pytest.skip(f"torchtitan Qwen3 RoPE helper unavailable: {exc}")
+
+    args = _tiny_args()
+    rope = RoPE(
+        RoPE.Config(
+            dim=args.head_dim,
+            max_seq_len=args.max_seq_len,
+            theta=args.rope_theta,
+            backend="cos_sin",
+        )
+    )
+    xq = torch.randn(2, args.max_seq_len, args.n_heads, args.head_dim)
+    xk = torch.randn(2, args.max_seq_len, args.n_kv_heads, args.head_dim)
+
+    actual = apply_rotary_emb_cos_sin(xq, xk, rope.cache)
+    expected = tt_apply_rotary_emb_cos_sin(xq, xk, rope.cache)
+
+    torch.testing.assert_close(actual[0], expected[0])
+    torch.testing.assert_close(actual[1], expected[1])
+
+
+def test_qwen3_autoparallel_pipeline_smoke(device_mesh_2d):
+    args = _tiny_args(n_layers=2, max_seq_len=8)
+    batch_size = 2 * device_mesh_2d.shape[0]
+
+    with torch.device("meta"):
+        model = Transformer(args)
+
+    def input_fn():
+        return torch.randint(
+            0,
+            args.vocab_size,
+            (batch_size, args.max_seq_len),
+            device="cuda",
+        )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+
+    with AutoParallel(
+        model,
+        input_fn,
+        device_mesh_2d,
+        mp_policy,
+        repeated_subgraphs=True,
+    ) as autop:
+        autop.add_input_constraints([(Shard(0), Replicate())])
+        autop.add_output_constraints([(Shard(0), Shard(2))])
+        sharding_placement = autop.optimize_placement(verbose=False)
+        parallel_mod = autop.apply_placement(sharding_placement)
+
+    assert isinstance(parallel_mod, Transformer)
+
+
+def test_qwen3_moe_auto_parallel_smoke(device_mesh_2d):
+    args = _tiny_moe_args()
+    local_batch_size = 1
+
+    with torch.device("meta"):
+        model = Transformer(args, mesh=device_mesh_2d, moe_axis_name="tp")
+
+    expected_param_shapes = {
+        name: tuple(param.shape) for name, param in model.named_parameters()
+    }
+    expected_nparams = sum(param.numel() for param in model.parameters())
+
+    tokens = DTensor.from_local(
+        torch.randint(
+            0,
+            args.vocab_size,
+            (local_batch_size, args.max_seq_len),
+            device="cuda",
+        ),
+        device_mesh_2d,
+        [Shard(0), Shard(0)],
+    )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+    parallel_mod = auto_parallel(
+        model,
+        device_mesh_2d,
+        sample_inputs=(tokens,),
+        out_shardings=(Shard(0), Shard(2)),
+        mp_policy=mp_policy,
+        dynamic=True,
+    )
+
+    assert isinstance(parallel_mod, Transformer)
+    assert sum(param.numel() for param in parallel_mod.parameters()) == expected_nparams
+    assert {
+        name: tuple(param.shape) for name, param in parallel_mod.named_parameters()
+    } == expected_param_shapes
+    assert parallel_mod.layers["0"].moe.experts.w1.shape == (
+        args.num_experts,
+        args.moe_hidden_dim,
+        args.dim,
+    )
+
+    parallel_mod.to_empty(device="cuda")
+    parallel_mod.init_weights(buffer_device=torch.device("cuda"), seed=0)
+
+    local_tokens = torch.randint(
+        0,
+        args.vocab_size,
+        (local_batch_size, args.max_seq_len),
+        device="cuda",
+    )
+    out = parallel_mod(local_tokens)
+    assert out.shape == (
+        local_batch_size * device_mesh_2d.shape[1],
+        args.max_seq_len,
+        args.vocab_size // device_mesh_2d.shape[1],
+    )
+    out.backward(torch.randn_like(out))

From b02ac054facca0aaaaa70b380f550bd3d2f85188 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Thu, 28 May 2026 09:47:51 -0700
Subject: [PATCH 02/27] Add sharding optimizer profiling snapshot

Record optimizer setup and solve profiling in ShardingOptimizer, add a contributor pipeline document, and include the profiling result artifacts used to inspect LLaMA and Qwen behavior.

Authored with Claude.
---
 autoparallel/optimize_sharding.py             |    348 +-
 docs/codebase_pipeline.md                     |    593 +
 examples/example_llama3.py                    |      7 +-
 ...ama3_3b_ilp_node_indegree_distribution.svg |     51 +
 .../llama3_8b_4x4_strategy_full.json          | 287470 +++++++++++++++
 .../llama3_8b_4x4_strategy_summary.json       |   2054 +
 .../real_llama3_3b_dag_node_stats.csv         |   7200 +
 .../real_llama3_3b_dag_summary.json           |    883 +
 .../real_llama3_3b_merge_points.csv           |   1668 +
 profile_results/real_llama3_by_mesh_dim.svg   |    167 +
 profile_results/real_llama3_by_model_size.svg |    177 +
 profile_results/real_llama3_dag_analysis.py   |    255 +
 .../real_llama3_optimizer_presolve_3d4d.log   |      7 +
 .../real_llama3_optimizer_sweep.csv           |      9 +
 .../real_llama3_optimizer_sweep.jsonl         |      8 +
 .../real_llama3_optimizer_sweep.log           |     54 +
 .../real_llama3_optimizer_sweep.py            |    351 +
 .../real_llama3_partial_presolve.csv          |      3 +
 profile_results/real_llama3_timeouts.csv      |      3 +
 pyproject.toml                                |      6 +
 qwen3_8b_autoparallel_30steps.log             |      1 +
 qwen3_8b_autoparallel_30steps_loss_curve.png  |      1 +
 qwen3_8b_autoparallel_30steps_loss_curve.svg  |      1 +
 qwen3_8b_autoparallel_30steps_losses.csv      |      1 +
 qwen3_moe_mast_20steps_loss_curve.png         |    Bin 0 -> 19666 bytes
 qwen3_moe_mast_20steps_loss_curve.svg         |     68 +
 qwen3_moe_mast_20steps_losses.csv             |     21 +
 27 files changed, 301396 insertions(+), 11 deletions(-)
 create mode 100644 docs/codebase_pipeline.md
 create mode 100644 profile_results/llama3_3b_ilp_node_indegree_distribution.svg
 create mode 100644 profile_results/llama3_8b_4x4_strategy_full.json
 create mode 100644 profile_results/llama3_8b_4x4_strategy_summary.json
 create mode 100644 profile_results/real_llama3_3b_dag_node_stats.csv
 create mode 100644 profile_results/real_llama3_3b_dag_summary.json
 create mode 100644 profile_results/real_llama3_3b_merge_points.csv
 create mode 100644 profile_results/real_llama3_by_mesh_dim.svg
 create mode 100644 profile_results/real_llama3_by_model_size.svg
 create mode 100644 profile_results/real_llama3_dag_analysis.py
 create mode 100644 profile_results/real_llama3_optimizer_presolve_3d4d.log
 create mode 100644 profile_results/real_llama3_optimizer_sweep.csv
 create mode 100644 profile_results/real_llama3_optimizer_sweep.jsonl
 create mode 100644 profile_results/real_llama3_optimizer_sweep.log
 create mode 100644 profile_results/real_llama3_optimizer_sweep.py
 create mode 100644 profile_results/real_llama3_partial_presolve.csv
 create mode 100644 profile_results/real_llama3_timeouts.csv
 create mode 120000 qwen3_8b_autoparallel_30steps.log
 create mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.png
 create mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.svg
 create mode 120000 qwen3_8b_autoparallel_30steps_losses.csv
 create mode 100644 qwen3_moe_mast_20steps_loss_curve.png
 create mode 100644 qwen3_moe_mast_20steps_loss_curve.svg
 create mode 100644 qwen3_moe_mast_20steps_losses.csv

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 9692ef2f..2b1909ee 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -241,14 +241,36 @@ def __init__(
         )
         self._constraint_log: list[tuple[str, dict]] = []
         self._name_counters: dict[str, int] = {}
+        self.profile: dict[str, Any] = {
+            "mesh": self._profile_mesh(),
+            "model": self._profile_model(),
+            "timings": {},
+        }
+        t_init_start = time.perf_counter()
         t0 = time.perf_counter()
         self.strats = self.build_sharding_metadata()
+        t_strategy = time.perf_counter() - t0
+        self.profile["timings"]["strategy_enumeration_s"] = t_strategy
+        self.profile["strategies"] = self._profile_strategies()
+        logger.info(
+            "ShardingOptimizer phase profile: phase=strategy_enumeration "
+            "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s "
+            "graph_nodes=%s strategy_options=%s option_tuples=%s elapsed=%.3fs",
+            self.profile["mesh"]["shape"],
+            self.profile["mesh"]["dim_names"],
+            self.profile["mesh"]["size"],
+            self._format_billions(self.profile["model"]["parameter_numel"]),
+            self.profile["model"]["graph_nodes"],
+            self.profile["strategies"]["strategy_options"],
+            self.profile["strategies"]["option_tuples"],
+            t_strategy,
+        )
         # nodes/node_map are derived from strats (not graph.nodes) so that
         # shape-computation nodes skipped by build_sharding_metadata don't
         # appear and indices stay consistent.
         self.nodes = list(self.strats.keys())
         self.node_map = {node: i for i, node in enumerate(self.nodes)}
-        logger.debug("Placement options took %.3fs", time.perf_counter() - t0)
+        logger.debug("Placement options took %.3fs", t_strategy)
         from autoparallel.shardings.placement_options import get_placement_options_timer
 
         get_placement_options_timer().report()
@@ -263,13 +285,77 @@ def __init__(
         t0 = time.perf_counter()
         self.decision_vars = self._build_decision_vars()
         t1 = time.perf_counter()
+        logger.info(
+            "ShardingOptimizer phase profile: phase=decision_vars "
+            "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s "
+            "unique_ilp_vars=%s logical_decision_vars=%s "
+            "cluster_copied_decision_vars=%s pulp_var_creation=%.3fs "
+            "compute_cost=%.3fs edge_cost=%.3fs cost_estimation=%.3fs "
+            "elapsed=%.3fs",
+            self.profile["mesh"]["shape"],
+            self.profile["mesh"]["dim_names"],
+            self.profile["mesh"]["size"],
+            self._format_billions(self.profile["model"]["parameter_numel"]),
+            self._decision_var_profile["unique_pulp_variables"],
+            self._decision_var_profile["logical_decision_variables"],
+            self._decision_var_profile["cluster_copied_decision_variables"],
+            self._decision_var_profile["pulp_var_creation_s"],
+            self._decision_var_profile["compute_cost_estimation_s"],
+            self._decision_var_profile["edge_cost_estimation_s"],
+            self._decision_var_profile["cost_estimation_s"],
+            t1 - t0,
+        )
         self.validate()
         t2 = time.perf_counter()
         self.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize)
         self.add_default_constraints()
         t3 = time.perf_counter()
+        decision_var_build_s = t1 - t0
+        cost_estimation_s = self._decision_var_profile["cost_estimation_s"]
+        decision_var_overhead_s = max(
+            decision_var_build_s
+            - self._decision_var_profile["pulp_var_creation_s"]
+            - cost_estimation_s,
+            0.0,
+        )
+        self.profile["timings"].update(
+            {
+                "decision_var_build_s": decision_var_build_s,
+                "decision_var_overhead_s": decision_var_overhead_s,
+                "validation_s": t2 - t1,
+                "constraint_construction_s": t3 - t2,
+                "ilp_construction_s": (
+                    self._decision_var_profile["pulp_var_creation_s"]
+                    + decision_var_overhead_s
+                    + (t3 - t2)
+                ),
+                "init_total_s": t3 - t_init_start,
+            }
+        )
         n_unique_vars = len(self.pulp_variables)
         n_constraints = len(self.prob.constraints)
+        self.profile["ilp"] = {
+            "unique_variables": n_unique_vars,
+            "logical_decision_variables": self._decision_var_profile[
+                "logical_decision_variables"
+            ],
+            "cluster_copied_decision_variables": self._decision_var_profile[
+                "cluster_copied_decision_variables"
+            ],
+            "constraints": n_constraints,
+        }
+        logger.info(
+            "ShardingOptimizer phase profile: phase=constraints "
+            "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s "
+            "unique_ilp_vars=%s constraints=%s elapsed=%.3fs",
+            self.profile["mesh"]["shape"],
+            self.profile["mesh"]["dim_names"],
+            self.profile["mesh"]["size"],
+            self._format_billions(self.profile["model"]["parameter_numel"]),
+            n_unique_vars,
+            n_constraints,
+            t3 - t2,
+        )
         logger.debug(
             "ILP construction took %.3fs "
             "(decision_vars=%.3fs, validate=%.3fs, constraints=%.3fs)",
@@ -284,6 +370,157 @@ def __init__(
             len(self.decision_vars),
             n_constraints,
         )
+        self._log_init_profile()
+
+    def _profile_mesh(self):
+        try:
+            mesh_shape = tuple(int(d) for d in self.mesh.shape)
+        except Exception:
+            mesh_shape = tuple()
+        try:
+            mesh_size = int(self.mesh.size())
+        except Exception:
+            mesh_size = math.prod(mesh_shape) if mesh_shape else None
+        return {
+            "ndim": getattr(self.mesh, "ndim", len(mesh_shape)),
+            "shape": mesh_shape,
+            "dim_names": getattr(self.mesh, "mesh_dim_names", None),
+            "size": mesh_size,
+        }
+
+    def _profile_model(self):
+        graph_nodes = list(self.graph.nodes)
+        op_counts = defaultdict(int)
+        tensor_nodes = 0
+        for node in graph_nodes:
+            op_counts[node.op] += 1
+            if _produces_tensor(node.meta.get("val")):
+                tensor_nodes += 1
+
+        param_numel = 0
+        param_bytes = 0
+        unknown_param_nodes = 0
+        try:
+            param_nodes = get_param_nodes(self.graph)
+        except Exception:
+            param_nodes = []
+            unknown_param_nodes = None
+
+        for node in param_nodes:
+            val = node.meta.get("val")
+            if not isinstance(val, torch.Tensor):
+                unknown_param_nodes += 1
+                continue
+            numel = self._safe_tensor_numel(val)
+            if numel is None:
+                unknown_param_nodes += 1
+                continue
+            param_numel += numel
+            try:
+                param_bytes += numel * val.element_size()
+            except Exception:
+                pass
+
+        return {
+            "graph_nodes": len(graph_nodes),
+            "tensor_nodes": tensor_nodes,
+            "op_counts": dict(op_counts),
+            "parameter_nodes": len(param_nodes),
+            "parameter_numel": param_numel,
+            "parameter_bytes": param_bytes,
+            "unknown_parameter_nodes": unknown_param_nodes,
+        }
+
+    @staticmethod
+    def _safe_tensor_numel(tensor):
+        try:
+            numel = tensor.numel()
+            if isinstance(numel, int):
+                return numel
+            return int(numel)
+        except Exception:
+            pass
+
+        shape = getattr(tensor, "shape", None)
+        if shape is None:
+            return None
+
+        total = 1
+        for dim in shape:
+            dim = concretize_symint(dim)
+            if not isinstance(dim, int):
+                return None
+            total *= dim
+        return total
+
+    def _profile_strategies(self):
+        strategy_options = 0
+        option_tuples = 0
+        max_strategies_per_node = 0
+        for node in self.strats:
+            if node.op == "output" or not hasattr(self.strats[node], "strategies"):
+                continue
+            strategies = self.strats[node].strategies
+            strategy_options += len(strategies)
+            max_strategies_per_node = max(max_strategies_per_node, len(strategies))
+            option_tuples += sum(1 for _ in self.walk_over_options(node))
+        return {
+            "nodes": len(self.strats),
+            "strategy_options": strategy_options,
+            "option_tuples": option_tuples,
+            "max_strategies_per_node": max_strategies_per_node,
+        }
+
+    @staticmethod
+    def _format_billions(count):
+        if count is None:
+            return "unknown"
+        if count >= 1_000_000_000:
+            return f"{count / 1_000_000_000:.2f}B"
+        if count >= 1_000_000:
+            return f"{count / 1_000_000:.2f}M"
+        return str(count)
+
+    @staticmethod
+    def _safe_float(value):
+        try:
+            return float(value)
+        except Exception:
+            return math.nan
+
+    def _log_init_profile(self):
+        mesh = self.profile["mesh"]
+        model = self.profile["model"]
+        strategies = self.profile["strategies"]
+        ilp = self.profile["ilp"]
+        timings = self.profile["timings"]
+        logger.info(
+            "ShardingOptimizer init profile: "
+            "mesh_shape=%s mesh_dim_names=%s mesh_size=%s "
+            "model_params=%s param_nodes=%s graph_nodes=%s tensor_nodes=%s "
+            "strategy_options=%s option_tuples=%s "
+            "unique_ilp_vars=%s logical_decision_vars=%s constraints=%s "
+            "timings={strategy_enumeration=%.3fs,cost_estimation=%.3fs,"
+            "ilp_construction=%.3fs,validation=%.3fs,total=%.3fs}",
+            mesh["shape"],
+            mesh["dim_names"],
+            mesh["size"],
+            self._format_billions(model["parameter_numel"]),
+            model["parameter_nodes"],
+            model["graph_nodes"],
+            model["tensor_nodes"],
+            strategies["strategy_options"],
+            strategies["option_tuples"],
+            ilp["unique_variables"],
+            ilp["logical_decision_variables"],
+            ilp["constraints"],
+            timings["strategy_enumeration_s"],
+            timings["cost_estimation_s"],
+            timings["ilp_construction_s"],
+            timings["validation_s"],
+            timings["init_total_s"],
+        )
+        logger.debug("ShardingOptimizer init profile detail: %s", self.profile)
 
     def _get_next_name(self, prefix):
         idx = self._name_counters.setdefault(prefix, 0)
@@ -580,6 +817,23 @@ def _build_decision_vars(self):
             t_compute,
             t_edge,
         )
+        self._decision_var_profile = {
+            "logical_decision_variables": n_vars,
+            "cluster_copied_decision_variables": n_cluster_copied,
+            "unique_pulp_variables": len(self.pulp_variables),
+            "pulp_var_creation_s": t_pulp_end - t_pulp_start,
+            "compute_cost_estimation_s": t_compute,
+            "edge_cost_estimation_s": t_edge,
+            "cost_estimation_s": t_compute + t_edge,
+        }
+        self.profile["timings"].update(
+            {
+                "pulp_var_creation_s": t_pulp_end - t_pulp_start,
+                "compute_cost_estimation_s": t_compute,
+                "edge_cost_estimation_s": t_edge,
+                "cost_estimation_s": t_compute + t_edge,
+            }
+        )
         return decision_vars
 
     def _resolve_decision_var(self, key):
@@ -884,9 +1138,11 @@ def _solve(self, verbose=False):
         # Use a dedicated temp directory for PuLP's intermediate files (.mps,
         # .sol, etc.) so they are always cleaned up, even if the process is
         # killed.  Without this, leftover files can fill up /tmp (tmpfs).
+        t0 = time.perf_counter()
         with tempfile.TemporaryDirectory() as tmpdir:
             solver.tmpDir = tmpdir
             self.prob.solve(solver)
+        solve_s = time.perf_counter() - t0
 
         self.selected_keys = [
             key for key, dv in self.decision_vars.items() if dv.var.value() == 1
@@ -904,6 +1160,60 @@ def _solve(self, verbose=False):
                 "constraints, and consider relaxing input/output constraints or "
                 "using a larger mesh."
             )
+        return solve_s
+
+    def _log_solve_profile(
+        self,
+        solve_kind,
+        objective_value,
+        objective_s,
+        solve_s,
+        extract_s,
+        total_s,
+    ):
+        mesh = self.profile["mesh"]
+        model = self.profile["model"]
+        timings = self.profile["timings"]
+        status = pulp.LpStatus.get(self.prob.status, self.prob.status)
+        pipeline_total_s = timings["init_total_s"] + total_s
+        logger.info(
+            "ShardingOptimizer %s profile: "
+            "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s "
+            "unique_ilp_vars=%s constraints=%s status=%s objective=%.4f "
+            "timings={strategy_enumeration=%.3fs,cost_estimation=%.3fs,"
+            "ilp_construction=%.3fs,objective=%.3fs,solve=%.3fs,"
+            "extract=%.3fs,total_solve_call=%.3fs,total_pipeline=%.3fs}",
+            solve_kind,
+            mesh["shape"],
+            mesh["dim_names"],
+            mesh["size"],
+            self._format_billions(model["parameter_numel"]),
+            len(self.pulp_variables),
+            len(self.prob.constraints),
+            status,
+            objective_value,
+            timings["strategy_enumeration_s"],
+            timings["cost_estimation_s"],
+            timings["ilp_construction_s"],
+            objective_s,
+            solve_s,
+            extract_s,
+            total_s,
+            pipeline_total_s,
+        )
+        self.profile["last_solve"] = {
+            "kind": solve_kind,
+            "objective": objective_value,
+            "status": status,
+            "constraints": len(self.prob.constraints),
+            "unique_variables": len(self.pulp_variables),
+            "objective_s": objective_s,
+            "solve_s": solve_s,
+            "extract_s": extract_s,
+            "total_s": total_s,
+            "pipeline_total_s": pipeline_total_s,
+        }
+        logger.debug("ShardingOptimizer solve profile detail: %s", self.profile)
 
     def _extract_and_validate_solution(self):
         """Validate the ILP solution and return the optimal strategy per node."""
@@ -948,13 +1258,26 @@ def _to_concrete_solution(self, solution):
 
     def get_solution(self, verbose=False):
         t0 = time.perf_counter()
+        t_objective0 = time.perf_counter()
         self._set_objective()
-        self._solve(verbose)
-        obj_value = pulp.value(self.prob.objective)
+        t_objective1 = time.perf_counter()
+        solve_s = self._solve(verbose)
+        obj_value = self._safe_float(pulp.value(self.prob.objective))
+        t_extract0 = time.perf_counter()
+        solution = self._to_orig_solution(self._extract_and_validate_solution())
+        t_extract1 = time.perf_counter()
         logger.debug(
             "ILP solve took %.3fs (objective=%.4f)", time.perf_counter() - t0, obj_value
         )
-        return self._to_orig_solution(self._extract_and_validate_solution())
+        self._log_solve_profile(
+            "solve",
+            obj_value,
+            t_objective1 - t_objective0,
+            solve_s,
+            t_extract1 - t_extract0,
+            t_extract1 - t0,
+        )
+        return solution
 
     def resolve(self, verbose=False):
         """Re-solve the ILP after adding or removing constraints.
@@ -963,14 +1286,25 @@ def resolve(self, verbose=False):
         be called multiple times after modifying constraints.
         """
         t0 = time.perf_counter()
-        self._solve(verbose)
-        obj_value = pulp.value(self.prob.objective)
+        solve_s = self._solve(verbose)
+        obj_value = self._safe_float(pulp.value(self.prob.objective))
+        t_extract0 = time.perf_counter()
+        solution = self._to_orig_solution(self._extract_and_validate_solution())
+        t_extract1 = time.perf_counter()
         logger.debug(
             "ILP re-solve took %.3fs (objective=%.4f)",
             time.perf_counter() - t0,
             obj_value,
         )
-        return self._to_orig_solution(self._extract_and_validate_solution())
+        self._log_solve_profile(
+            "re-solve",
+            obj_value,
+            0.0,
+            solve_s,
+            t_extract1 - t_extract0,
+            t_extract1 - t0,
+        )
+        return solution
 
     def remove_constraints(self, names):
         """Remove constraints by name, allowing re-solve to revert to the
diff --git a/docs/codebase_pipeline.md b/docs/codebase_pipeline.md
new file mode 100644
index 00000000..533c4c09
--- /dev/null
+++ b/docs/codebase_pipeline.md
@@ -0,0 +1,593 @@
+# AutoParallel Codebase Pipeline
+
+This document is a code-oriented guide for new contributors. It explains the
+main pipeline, the important modules, and how data moves from a user model to a
+parallelized module.
+
+AutoParallel is experimental and tightly coupled to PyTorch internals such as
+FX, Dynamo export, AOTAutograd, DTensor, and Inductor. The best mental model is:
+
+```text
+user model
+  -> fake/global tracing
+  -> joint forward/backward FX graph
+  -> per-node sharding strategy enumeration
+  -> ILP optimization
+  -> graph lowering with redistributions
+  -> parallel nn.Module with sharded params/buffers
+  -> optional torch.compile backend passes
+```
+
+## Public Entry Points
+
+The public API is exported from `autoparallel/__init__.py`:
+
+- `auto_parallel(...)`: simple wrapper for common usage.
+- `AutoParallel(...)`: context-manager API for debugging and custom constraints.
+- `autoparallel_backend(...)`: `torch.compile` backend wrapper for activation
+  checkpointing and communication/compute overlap passes.
+- `with_sharding_constraint(...)`: model-level constraint helper.
+
+The main implementation lives in `autoparallel/api.py`.
+
+## End-to-End Pipeline
+
+### 1. User Defines Model, Mesh, and Example Inputs
+
+Users provide:
+
+- an `nn.Module`, often built on the `meta` device,
+- a PyTorch `DeviceMesh`,
+- example inputs,
+- output placement constraints,
+- optionally mixed precision and parameter memory constraints.
+
+The simple API accepts real tensors or DTensors as `sample_inputs`. DTensor
+inputs are important because their placements become input constraints. Regular
+tensors are treated as replicated on every mesh dimension.
+
+Relevant files:
+
+- `autoparallel/api.py`
+- `autoparallel/input_validation.py`
+- `docs/api_walkthrough.md`
+- `examples/example_autoparallel.py`
+- `examples/example_hf.py`
+
+### 2. Input Metadata Is Normalized
+
+In `auto_parallel(...)`, sample inputs are converted into metadata:
+
+- global shapes,
+- dtypes,
+- devices,
+- input placement tuples,
+- pytree structure.
+
+This is handled by `_extract_input_info(...)` and `_make_input_fn(...)` in
+`autoparallel/input_validation.py`.
+
+The generated `input_fn()` creates fresh tensors with the same global metadata.
+It is called later inside `FakeTensorMode`, so the tensors become fake tensors
+instead of real allocations.
+
+### 3. AutoParallel Context Setup
+
+`AutoParallel.__init__` prepares the optimization environment:
+
+- deep-copies the user model so tracing and dtype wrappers do not mutate it,
+- canonicalizes and applies mixed precision wrappers if requested,
+- moves meta parameters and buffers into fake tensors on the mesh device,
+- stores the mesh, cost model, and dynamic-shape setting,
+- optionally creates a `ShapeEnv` for symbolic shapes.
+
+`AutoParallel.__enter__` then:
+
+- configures the NCCL topology cost model,
+- enters the `DeviceMesh` context,
+- traces the model into a joint graph,
+- disables Inductor comprehensive padding while AutoParallel is active,
+- constructs a `ShardingOptimizer`.
+
+Relevant files:
+
+- `autoparallel/api.py`
+- `autoparallel/tracing.py`
+- `autoparallel/cast_parametrization.py`
+- `autoparallel/cost_models/nccl_cost_model.py`
+- `autoparallel/cost_models/collective_runtime_estimation.py`
+
+### 4. Model Is Traced Into a Joint FX Graph
+
+Tracing happens in `build_joint_graph(...)` in `autoparallel/api.py`.
+
+The flow is:
+
+1. Call `input_fn()` under `FakeTensorMode`.
+2. Optionally convert fake inputs to symbolic dynamic inputs.
+3. Capture a forward graph with Dynamo export.
+4. Restore model state after capture.
+5. Add unused params and buffers so they still appear in the parameter specs.
+6. Use AOTAutograd to export a joint forward/backward graph.
+7. Clean up and normalize the graph.
+8. Optionally replace `view -> mm -> view` patterns with `einsum`.
+9. Add alias nodes to expose more optimization opportunities.
+
+The resulting graph is a single FX graph containing forward computation,
+backward computation, parameter nodes, gradients, tangents, and outputs.
+AutoParallel optimizes this joint graph rather than optimizing only the forward
+path.
+
+Relevant files:
+
+- `autoparallel/api.py`
+- `autoparallel/tracing.py`
+- `autoparallel/graph_passes/graph_utils.py`
+- `autoparallel/graph_passes/extract_forward.py`
+
+## Sharding Strategy Generation
+
+### 5. The Optimizer Builds Placement Options
+
+`ShardingOptimizer` is implemented in `autoparallel/optimize_sharding.py`.
+
+It first creates a concrete copy of the graph with symbolic dimensions replaced
+by their hinted concrete values. The optimizer uses this concrete graph for
+strategy enumeration, cost estimation, graph clustering, and ILP construction.
+The original graph is kept for `apply_sharding`, which may still need symbolic
+shape metadata.
+
+For each tensor-producing node, `build_sharding_metadata()` creates an
+`OpStrategy`. An `OpStrategy` is a list of possible `OpSpec` choices. Each
+`OpSpec` describes:
+
+- expected input DTensor specs,
+- produced output DTensor specs,
+- redistribution costs from predecessor placements.
+
+Placeholders and parameters start with all valid placements generated by
+`_create_all_options(...)`. Call-function nodes get strategies from
+`get_placement_options_for_node(...)`.
+
+Relevant files:
+
+- `autoparallel/optimize_sharding.py`
+- `autoparallel/shardings/placement_options.py`
+- `autoparallel/shardings/propagation_rules.py`
+
+### 6. Placement Rules Come From DTensor Plus AutoParallel Overrides
+
+`autoparallel/shardings/placement_options.py` dispatches strategy generation.
+
+For normal ops:
+
+- if AutoParallel has a custom rule in `_op_rules`, it uses that,
+- otherwise it asks PyTorch DTensor for an op strategy through helper wrappers.
+
+AutoParallel adds custom rules in `autoparallel/shardings/propagation_rules.py`.
+These rules cover cases where the default DTensor propagation is missing,
+too strict, or not shaped for AutoParallel's optimizer.
+
+Important examples:
+
+- view and reshape-like ops,
+- `operator.getitem`,
+- pointwise behavior,
+- tensor factory ops,
+- matmul/einsum behavior,
+- local-map and MoE-related higher-order ops,
+- flex attention higher-order ops.
+
+After strategies are generated, AutoParallel:
+
+- propagates tensor metadata,
+- fills missing redistribution costs,
+- removes invalid shardings where tensor dimensions are too small for the mesh,
+- deduplicates equivalent configurations,
+- caches repeated placement-option lookups.
+
+## Cost Model
+
+### 7. Compute Cost
+
+Compute cost is estimated in `autoparallel/cost_models/compute_estimation.py`.
+
+The broad idea is:
+
+- count FLOPs when possible,
+- estimate memory read/write time,
+- estimate compute time from device throughput,
+- use the max of memory time and compute time,
+- apply a small launch floor for tiny kernels,
+- treat pure view-like shape operations as cheap or free.
+
+The module contains hardware limit tables for several GPU families and a flop
+counter extension for `einsum`.
+
+### 8. Communication Cost
+
+Communication cost is estimated in
+`autoparallel/cost_models/collective_runtime_estimation.py`.
+
+The key transition types are:
+
+- `Shard -> Replicate`: all-gather,
+- `Partial -> Replicate`: all-reduce,
+- `Partial -> Shard`: reduce-scatter,
+- `Shard(dim_a) -> Shard(dim_b)`: all-to-all,
+- `Replicate -> Shard`: local narrowing, usually no collective.
+
+By default, `AutoParallel.__enter__` detects an NCCL topology config and the
+cost model dispatches to `autoparallel/cost_models/nccl_cost_model.py`. This is
+important because intra-node and inter-node collectives have very different
+costs.
+
+Redistribution cost also includes penalties for non-contiguous layouts and
+non-dim-0 shard reshuffling, because those cases need extra memory movement.
+
+### 9. Transition Cost
+
+The optimizer also adds a small sharding-transition penalty when a producer and
+consumer use different placements. This is a tie-breaker that encourages
+placement stability when communication and compute costs are otherwise similar.
+
+## ILP Optimization
+
+### 10. Decision Variables
+
+The ILP is built in `ShardingOptimizer`.
+
+A decision variable represents:
+
+```text
+(node, argument index, output strategy index, producer input strategy index)
+```
+
+Each variable has:
+
+- total cost,
+- compute cost,
+- communication cost,
+- transition cost,
+- selected `OpSpec`,
+- input and output DTensor specs.
+
+For repeated subgraphs, graph clustering can link equivalent decision variables
+so the ILP is smaller.
+
+Relevant files:
+
+- `autoparallel/optimize_sharding.py`
+- `autoparallel/graph_passes/graph_clustering.py`
+
+### 11. Default Constraints
+
+The optimizer adds these constraints before solving:
+
+- uniqueness: each node argument selects exactly one choice,
+- same-output consistency: all tensor arguments of a multi-input op agree on
+  one output strategy,
+- flow consistency: producer output placement matches consumer input placement,
+- invalid-cost constraints: impossible configurations cannot be selected,
+- forward/backward consistency constraints,
+- gradient-reduce dtype constraints.
+
+User-facing constraints are layered on top:
+
+- `add_input_constraints(...)`,
+- `add_output_constraints(...)`,
+- `add_parameter_memory_constraint(...)`,
+- node constraints through optimizer helpers,
+- model-embedded `with_sharding_constraint(...)`.
+
+### 12. Solving
+
+`get_solution(...)` sets the objective and solves the ILP with PuLP's CBC
+solver. The objective minimizes total estimated runtime cost across the joint
+graph:
+
+```text
+compute cost + communication cost + transition cost
+```
+
+The result is a mapping:
+
+```text
+FX node -> chosen OpSpec
+```
+
+Public debugging helpers include:
+
+- `get_log(...)`,
+- `print_costs_for_node(...)`,
+- `explain_placement(...)`,
+- `diff_solutions(...)`,
+- `save(...)` and `load(...)`,
+- `save_placements(...)` and `load_placements(...)`,
+- `get_json(...)`.
+
+## Applying the Solution
+
+### 13. Lowering the Graph to Local Execution
+
+`apply_placement(...)` calls `apply_sharding_to_model(...)` in
+`autoparallel/apply_sharding.py`.
+
+The important class is `ApplyShardingInterpreter`, an FX interpreter that walks
+the original joint graph and inserts the behavior implied by the chosen
+placements.
+
+For each operation, it:
+
+- looks up the producer specs and target input specs,
+- redistributes local tensors when placements differ,
+- handles `operator.getitem` specially for tuple outputs,
+- localizes shape arguments for tensor factories and view ops,
+- wraps view inputs in DTensor in static mode when DTensor should perform
+  global-to-local shape conversion,
+- executes the original op,
+- converts DTensor outputs back to local tensors.
+
+The output is a parallel FX graph that operates on local tensors and explicit
+collective/redistribution behavior.
+
+Relevant files:
+
+- `autoparallel/apply_sharding.py`
+- `autoparallel/shardings/ordered_sharding.py`
+
+### 14. Parameters and Buffers Are Sharded
+
+`_shard_params_and_buffers(...)` builds DTensor parameters and buffers from the
+solved placements. It uses the original graph's named parameter and buffer
+descriptors to map FQNs to FX nodes.
+
+The returned dictionaries are:
+
+```text
+fqn -> sharded Parameter
+fqn -> sharded buffer DTensor
+```
+
+`make_parallel_module(...)` then constructs the final module.
+
+Relevant files:
+
+- `autoparallel/apply_sharding.py`
+- `autoparallel/module_construction.py`
+
+### 15. Parallel Module Construction
+
+`autoparallel/module_construction.py` creates a new module class that mirrors
+the user's original model class.
+
+It preserves:
+
+- user-defined instance attributes,
+- nested module structure,
+- `ModuleDict`-like containers when possible,
+- parameter aliases,
+- buffer aliases,
+- module aliases,
+- orphan submodules needed by initialization code.
+
+It also replaces the module's `forward` with the AutoParallel-generated
+function and wraps `init_weights` if the model has one.
+
+### 16. Runtime Forward
+
+The generated `forward` in `AutoParallel.apply_placement(...)`:
+
+1. Flattens user inputs.
+2. Validates local runtime shapes and dtypes against traced expectations.
+3. Reads DTensor parameters and buffers from the module.
+4. Converts parameters and buffers to local tensors.
+5. Boxes params, buffers, and runtime inputs into the AOTAutograd-compiled
+   function.
+6. Uses the joint forward/backward function when gradients are enabled.
+7. Uses a forward-only extracted graph under `torch.no_grad()`.
+
+The returned parallel module expects local per-rank tensors at runtime, not
+global tensors.
+
+### 17. Initialization and Loading
+
+A common workflow is:
+
+```python
+with torch.device("meta"):
+    model = MyModel(...)
+
+parallel_model = auto_parallel(...)
+parallel_model.to_empty(device="cuda")
+parallel_model.init_weights()
+```
+
+`autoparallel/init_weights.py` makes typical single-GPU initialization code
+work with sharded DTensor parameters. It intercepts parameter and buffer
+assignments during `init_weights` and copies the assigned full tensor into the
+existing DTensor placement.
+
+Save/load support lives in:
+
+- `autoparallel/serialization.py`
+- `docs/save_load.md`
+
+## Optional Compilation Pipeline
+
+The eager parallel module can be passed to:
+
+```python
+torch.compile(parallel_model, backend=autoparallel_backend())
+```
+
+`autoparallel/compile.py` wraps Inductor and can enable:
+
+- activation checkpointing joint pass,
+- collective bucketing,
+- overlap scheduling,
+- insertion of overlap dependencies,
+- prefetch limits.
+
+Activation checkpointing logic is in:
+
+- `autoparallel/graph_passes/activation_checkpointing.py`
+
+Other graph and scheduling passes live under:
+
+- `autoparallel/graph_passes/`
+- `autoparallel/graph_passes/async_tp/`
+- `autoparallel/graph_passes/autobucketing_inductor/`
+
+## Important Supporting Areas
+
+### Custom Ops and Constraints
+
+`autoparallel/collectives.py` exposes sharding constraints and related
+collective helpers. Model authors can use `with_sharding_constraint(...)` inside
+model code to force an intermediate placement.
+
+`autoparallel/ops.py` contains registered AutoParallel-specific operations.
+
+### Local Map and MoE
+
+AutoParallel has special handling for `local_map` and MoE-style communication.
+Placement options for local-map higher-order ops are generated in
+`placement_options.py`, while user-facing examples and explanations are in:
+
+- `docs/hc_and_moe.md`
+- `examples/example_local_map.py`
+- `examples/example_dcp.py`
+- `examples/native_ds3/`
+
+### Dynamic Shapes
+
+When `dynamic=True`, `AutoParallel` traces with symbolic dimensions. The
+optimizer still works on a concretized graph, but `apply_sharding` preserves the
+original symbolic graph and recreates local fake inputs with fresh symbols for
+lowering. Runtime input validation allows dimensions marked dynamic to vary.
+
+Relevant files:
+
+- `autoparallel/api.py`
+- `autoparallel/optimize_sharding.py`
+- `autoparallel/apply_sharding.py`
+- `autoparallel/input_validation.py`
+- `tests/test_dynamic_shapes.py`
+
+### JSON and Visualization
+
+The optimizer can export strategy decisions to JSON with `get_json()`.
+
+Relevant files:
+
+- `autoparallel/export_json.py`
+- `autoparallel/visualizer/build_display_from_json.py`
+- `tests/test_export_json.py`
+
+## Directory Map
+
+```text
+autoparallel/
+  api.py                         public APIs and orchestration
+  tracing.py                     fake tensor conversion and decomposition setup
+  input_validation.py            sample input metadata and runtime checks
+  optimize_sharding.py           ILP optimizer and debugging helpers
+  apply_sharding.py              graph lowering and sharded param creation
+  module_construction.py         final parallel module construction
+  init_weights.py                DTensor-aware init_weights wrapper
+  compile.py                     torch.compile backend wrapper
+  collectives.py                 sharding constraints and collective helpers
+  ops.py                         custom operator registrations
+  serialization.py               optimizer and placement save/load
+  export_json.py                 visualization/export format
+
+autoparallel/shardings/
+  placement_options.py           per-node strategy generation
+  propagation_rules.py           custom DTensor propagation rules
+  dtensor_sharding_helpers.py    wrappers around DTensor strategy APIs
+  ordered_sharding.py            optimized redistribution ordering
+
+autoparallel/cost_models/
+  compute_estimation.py          operation runtime estimates
+  collective_runtime_estimation.py redistribution cost estimates
+  nccl_cost_model.py             NCCL topology-aware cost model
+
+autoparallel/graph_passes/
+  graph_utils.py                 graph cleanup and helper analysis
+  graph_clustering.py            repeated-subgraph detection
+  activation_checkpointing.py    recomputation/AC tagging and pass
+  extract_forward.py             forward-only graph extraction
+  auto_bucketing.py              bucketing helpers
+  async_tp/                      async tensor-parallel passes
+  autobucketing_inductor/        Inductor-oriented bucketing passes
+
+docs/                            user and contributor documentation
+examples/                        runnable examples
+tests/                           behavior and regression tests
+```
+
+## How to Read the Code
+
+For a first pass, read in this order:
+
+1. `docs/basic_concepts.md`
+2. `docs/api_walkthrough.md`
+3. `autoparallel/api.py`
+4. `autoparallel/optimize_sharding.py`
+5. `autoparallel/shardings/placement_options.py`
+6. `autoparallel/shardings/propagation_rules.py`
+7. `autoparallel/apply_sharding.py`
+8. `autoparallel/module_construction.py`
+9. `autoparallel/compile.py`
+
+Then use tests to understand edge cases:
+
+- `tests/test_api.py`
+- `tests/test_auto_parallel_simple.py`
+- `tests/test_optimize_placement.py`
+- `tests/test_propagation_rules.py`
+- `tests/test_apply_sharding.py`
+- `tests/test_dynamic_shapes.py`
+- `tests/test_flex_attention.py`
+- `tests/test_inference_path.py`
+
+## Debugging Workflow
+
+When investigating a model or optimizer decision:
+
+1. Start with the full `AutoParallel` API instead of `auto_parallel(...)`.
+2. Add explicit input and output constraints.
+3. Add a parameter memory constraint if you expect FSDP-like sharding.
+4. Call `optimize_placement(verbose=True)`.
+5. Read the optimizer log for chosen placements and cost breakdowns.
+6. Use `print_costs_for_node(...)` for a suspicious node.
+7. Use `explain_placement(...)` to compare a target placement with the chosen
+   placement.
+8. Temporarily add a node constraint and compare with `diff_solutions(...)`.
+9. Inspect the parallel graph emitted by structured logs or `parallel_gm`.
+
+Common symptoms:
+
+- Replicated parameters: missing or loose parameter memory constraint.
+- Infeasible ILP: contradictory input/output/node constraints or shard dim too
+  small for the mesh.
+- Unexpected all-gather/all-reduce: producer and consumer placements disagree.
+- Shape mismatch at runtime: passing global tensors to a module that expects
+  local tensors.
+- Dynamic-shape compile failure: check whether symbolic dims were concretized
+  too early or local shape args were not localized.
+
+## Contributor Notes
+
+- Prefer existing DTensor strategy APIs before adding custom propagation rules.
+- Add custom rules only when the default rule is missing or does not preserve
+  the metadata AutoParallel needs.
+- Keep optimizer constraints explicit; hidden state makes debugging ILP failures
+  difficult.
+- Add focused tests when touching strategy enumeration, cost modeling,
+  constraints, or graph lowering.
+- Be careful with aliases: parameters, buffers, and modules can share identity,
+  and the code intentionally preserves those relationships.
+- The traced graph uses global shapes; the returned module executes on local
+  tensors. Many bugs come from mixing those two worlds.
diff --git a/examples/example_llama3.py b/examples/example_llama3.py
index 3903ba8d..5c09bd43 100644
--- a/examples/example_llama3.py
+++ b/examples/example_llama3.py
@@ -8,10 +8,6 @@
 from functools import partial
 
 import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Partial, Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
 from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
 from autoparallel.api import AutoParallel
 from autoparallel.compile import autoparallel_backend
@@ -24,6 +20,9 @@
 )
 from autoparallel.graph_passes.debug_helpers import make_custom_runtime_estimation
 from autoparallel.graph_passes.estimate_graph_metrics import estimate_graph_metrics
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Partial, Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
 
 logging.basicConfig(level=logging.DEBUG)
 
diff --git a/profile_results/llama3_3b_ilp_node_indegree_distribution.svg b/profile_results/llama3_3b_ilp_node_indegree_distribution.svg
new file mode 100644
index 00000000..d722fd85
--- /dev/null
+++ b/profile_results/llama3_3b_ilp_node_indegree_distribution.svg
@@ -0,0 +1,51 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="980" height="620" viewBox="0 0 980 620">
+<rect width="100%" height="100%" fill="#ffffff"/>
+<style>
+text { font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; fill: #111827; }
+.title { font-size: 24px; font-weight: 700; }
+.subtitle { font-size: 13px; fill: #4b5563; }
+.axis { stroke: #111827; stroke-width: 1.2; }
+.grid { stroke: #e5e7eb; stroke-width: 1; }
+.tick { font-size: 12px; fill: #374151; }
+.label { font-size: 13px; font-weight: 600; }
+.note { font-size: 12px; fill: #6b7280; }
+</style>
+<text class="title" x="92" y="42">AutoParallel ILP Node In-Degree Distribution</text>
+<text class="subtitle" x="92" y="66">LLaMA3 3B, mesh=(64,), repeated_subgraphs=True; raw optimizer DAG, no manual cluster collapse</text>
+<text class="subtitle" x="92" y="84">Nodes excluding output: 7199; unique direct dependency edges: 8805</text>
+<line class="grid" x1="92" y1="524.0" x2="938" y2="524.0"/>
+<text class="tick" x="80" y="528.0" text-anchor="end">1</text>
+<line class="grid" x1="92" y1="419.0" x2="938" y2="419.0"/>
+<text class="tick" x="80" y="423.0" text-anchor="end">10</text>
+<line class="grid" x1="92" y1="314.0" x2="938" y2="314.0"/>
+<text class="tick" x="80" y="318.0" text-anchor="end">100</text>
+<line class="grid" x1="92" y1="209.0" x2="938" y2="209.0"/>
+<text class="tick" x="80" y="213.0" text-anchor="end">1000</text>
+<line class="grid" x1="92" y1="104.0" x2="938" y2="104.0"/>
+<text class="tick" x="80" y="108.0" text-anchor="end">10000</text>
+<line class="axis" x1="92" y1="524" x2="938" y2="524"/>
+<line class="axis" x1="92" y1="104" x2="92" y2="524"/>
+<text class="tick" x="515.0" y="586" text-anchor="middle">direct dependency nodes / in-degree</text>
+<text class="tick" transform="translate(26 314.0) rotate(-90)" text-anchor="middle">node count, log scale</text>
+<rect x="140.2" y="271.0" width="92.0" height="253.0" rx="4" fill="#64748b"/>
+<text class="label" x="186.2" y="247.0" text-anchor="middle">257</text>
+<text class="note" x="186.2" y="263.0" text-anchor="middle">3.57%</text>
+<text class="tick" x="186.2" y="550.0" text-anchor="middle">0</text>
+<rect x="328.8" y="133.2" width="92.0" height="390.8" rx="4" fill="#2563eb"/>
+<text class="label" x="374.8" y="109.2" text-anchor="middle">5275</text>
+<text class="note" x="374.8" y="125.2" text-anchor="middle">73.27%</text>
+<text class="tick" x="374.8" y="550.0" text-anchor="middle">1</text>
+<rect x="517.2" y="187.3" width="92.0" height="336.7" rx="4" fill="#0f766e"/>
+<text class="label" x="563.2" y="163.3" text-anchor="middle">1611</text>
+<text class="note" x="563.2" y="179.3" text-anchor="middle">22.38%</text>
+<text class="tick" x="563.2" y="550.0" text-anchor="middle">2</text>
+<rect x="705.8" y="372.0" width="92.0" height="152.0" rx="4" fill="#d97706"/>
+<text class="label" x="751.8" y="348.0" text-anchor="middle">28</text>
+<text class="note" x="751.8" y="364.0" text-anchor="middle">0.39%</text>
+<text class="tick" x="751.8" y="550.0" text-anchor="middle">3</text>
+<rect x="894.2" y="372.0" width="92.0" height="152.0" rx="4" fill="#dc2626"/>
+<text class="label" x="940.2" y="348.0" text-anchor="middle">28</text>
+<text class="note" x="940.2" y="364.0" text-anchor="middle">0.39%</text>
+<text class="tick" x="940.2" y="550.0" text-anchor="middle">8</text>
+<text class="note" x="92" y="606">Histogram: 0-&gt;257, 1-&gt;5275, 2-&gt;1611, 3-&gt;28, 8-&gt;28</text>
+</svg>
\ No newline at end of file
diff --git a/profile_results/llama3_8b_4x4_strategy_full.json b/profile_results/llama3_8b_4x4_strategy_full.json
new file mode 100644
index 00000000..88f58ae3
--- /dev/null
+++ b/profile_results/llama3_8b_4x4_strategy_full.json
@@ -0,0 +1,287470 @@
+{
+  "mesh": {
+    "dim_names": [
+      "dp",
+      "tp"
+    ],
+    "shape": [
+      4,
+      4
+    ]
+  },
+  "nodes": [
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "tok_embeddings.weight",
+      "name": "primals_1",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(1)S(1)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.attention.wq.weight",
+      "name": "primals_2",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.attention.wk.weight",
+      "name": "primals_3",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.attention.wv.weight",
+      "name": "primals_4",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.attention.wo.weight",
+      "name": "primals_5",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.feed_forward.w1.weight",
+      "name": "primals_6",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.feed_forward.w2.weight",
+      "name": "primals_7",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.feed_forward.w3.weight",
+      "name": "primals_8",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.attention_norm.weight",
+      "name": "primals_9",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.0.ffn_norm.weight",
+      "name": "primals_10",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.attention.wq.weight",
+      "name": "primals_11",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.attention.wk.weight",
+      "name": "primals_12",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.attention.wv.weight",
+      "name": "primals_13",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.attention.wo.weight",
+      "name": "primals_14",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.feed_forward.w1.weight",
+      "name": "primals_15",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.feed_forward.w2.weight",
+      "name": "primals_16",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.feed_forward.w3.weight",
+      "name": "primals_17",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.attention_norm.weight",
+      "name": "primals_18",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.1.ffn_norm.weight",
+      "name": "primals_19",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.attention.wq.weight",
+      "name": "primals_20",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.attention.wk.weight",
+      "name": "primals_21",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.attention.wv.weight",
+      "name": "primals_22",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.attention.wo.weight",
+      "name": "primals_23",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.feed_forward.w1.weight",
+      "name": "primals_24",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.feed_forward.w2.weight",
+      "name": "primals_25",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.feed_forward.w3.weight",
+      "name": "primals_26",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.attention_norm.weight",
+      "name": "primals_27",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.2.ffn_norm.weight",
+      "name": "primals_28",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.attention.wq.weight",
+      "name": "primals_29",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.attention.wk.weight",
+      "name": "primals_30",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.attention.wv.weight",
+      "name": "primals_31",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.attention.wo.weight",
+      "name": "primals_32",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.feed_forward.w1.weight",
+      "name": "primals_33",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.feed_forward.w2.weight",
+      "name": "primals_34",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.feed_forward.w3.weight",
+      "name": "primals_35",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.attention_norm.weight",
+      "name": "primals_36",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.3.ffn_norm.weight",
+      "name": "primals_37",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.attention.wq.weight",
+      "name": "primals_38",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.attention.wk.weight",
+      "name": "primals_39",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.attention.wv.weight",
+      "name": "primals_40",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.attention.wo.weight",
+      "name": "primals_41",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.feed_forward.w1.weight",
+      "name": "primals_42",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.feed_forward.w2.weight",
+      "name": "primals_43",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.feed_forward.w3.weight",
+      "name": "primals_44",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.attention_norm.weight",
+      "name": "primals_45",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.4.ffn_norm.weight",
+      "name": "primals_46",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.attention.wq.weight",
+      "name": "primals_47",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.attention.wk.weight",
+      "name": "primals_48",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.attention.wv.weight",
+      "name": "primals_49",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.attention.wo.weight",
+      "name": "primals_50",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.feed_forward.w1.weight",
+      "name": "primals_51",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.feed_forward.w2.weight",
+      "name": "primals_52",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.feed_forward.w3.weight",
+      "name": "primals_53",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.attention_norm.weight",
+      "name": "primals_54",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.5.ffn_norm.weight",
+      "name": "primals_55",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.attention.wq.weight",
+      "name": "primals_56",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.attention.wk.weight",
+      "name": "primals_57",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.attention.wv.weight",
+      "name": "primals_58",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.attention.wo.weight",
+      "name": "primals_59",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.feed_forward.w1.weight",
+      "name": "primals_60",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.feed_forward.w2.weight",
+      "name": "primals_61",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.feed_forward.w3.weight",
+      "name": "primals_62",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.attention_norm.weight",
+      "name": "primals_63",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.6.ffn_norm.weight",
+      "name": "primals_64",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.attention.wq.weight",
+      "name": "primals_65",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.attention.wk.weight",
+      "name": "primals_66",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.attention.wv.weight",
+      "name": "primals_67",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.attention.wo.weight",
+      "name": "primals_68",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.feed_forward.w1.weight",
+      "name": "primals_69",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.feed_forward.w2.weight",
+      "name": "primals_70",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.feed_forward.w3.weight",
+      "name": "primals_71",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.attention_norm.weight",
+      "name": "primals_72",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.7.ffn_norm.weight",
+      "name": "primals_73",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.attention.wq.weight",
+      "name": "primals_74",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.attention.wk.weight",
+      "name": "primals_75",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.attention.wv.weight",
+      "name": "primals_76",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.attention.wo.weight",
+      "name": "primals_77",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.feed_forward.w1.weight",
+      "name": "primals_78",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.feed_forward.w2.weight",
+      "name": "primals_79",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.feed_forward.w3.weight",
+      "name": "primals_80",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.attention_norm.weight",
+      "name": "primals_81",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.8.ffn_norm.weight",
+      "name": "primals_82",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.attention.wq.weight",
+      "name": "primals_83",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.attention.wk.weight",
+      "name": "primals_84",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.attention.wv.weight",
+      "name": "primals_85",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.attention.wo.weight",
+      "name": "primals_86",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.feed_forward.w1.weight",
+      "name": "primals_87",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.feed_forward.w2.weight",
+      "name": "primals_88",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.feed_forward.w3.weight",
+      "name": "primals_89",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.attention_norm.weight",
+      "name": "primals_90",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.9.ffn_norm.weight",
+      "name": "primals_91",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.attention.wq.weight",
+      "name": "primals_92",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.attention.wk.weight",
+      "name": "primals_93",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.attention.wv.weight",
+      "name": "primals_94",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.attention.wo.weight",
+      "name": "primals_95",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.feed_forward.w1.weight",
+      "name": "primals_96",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.feed_forward.w2.weight",
+      "name": "primals_97",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.feed_forward.w3.weight",
+      "name": "primals_98",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.attention_norm.weight",
+      "name": "primals_99",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.10.ffn_norm.weight",
+      "name": "primals_100",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.attention.wq.weight",
+      "name": "primals_101",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.attention.wk.weight",
+      "name": "primals_102",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.attention.wv.weight",
+      "name": "primals_103",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.attention.wo.weight",
+      "name": "primals_104",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.feed_forward.w1.weight",
+      "name": "primals_105",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.feed_forward.w2.weight",
+      "name": "primals_106",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.feed_forward.w3.weight",
+      "name": "primals_107",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.attention_norm.weight",
+      "name": "primals_108",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.11.ffn_norm.weight",
+      "name": "primals_109",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.attention.wq.weight",
+      "name": "primals_110",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.attention.wk.weight",
+      "name": "primals_111",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.attention.wv.weight",
+      "name": "primals_112",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.attention.wo.weight",
+      "name": "primals_113",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.feed_forward.w1.weight",
+      "name": "primals_114",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.feed_forward.w2.weight",
+      "name": "primals_115",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.feed_forward.w3.weight",
+      "name": "primals_116",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.attention_norm.weight",
+      "name": "primals_117",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.12.ffn_norm.weight",
+      "name": "primals_118",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.attention.wq.weight",
+      "name": "primals_119",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.attention.wk.weight",
+      "name": "primals_120",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.attention.wv.weight",
+      "name": "primals_121",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.attention.wo.weight",
+      "name": "primals_122",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.feed_forward.w1.weight",
+      "name": "primals_123",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.feed_forward.w2.weight",
+      "name": "primals_124",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.feed_forward.w3.weight",
+      "name": "primals_125",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.attention_norm.weight",
+      "name": "primals_126",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.13.ffn_norm.weight",
+      "name": "primals_127",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.attention.wq.weight",
+      "name": "primals_128",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.attention.wk.weight",
+      "name": "primals_129",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.attention.wv.weight",
+      "name": "primals_130",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.attention.wo.weight",
+      "name": "primals_131",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.feed_forward.w1.weight",
+      "name": "primals_132",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.feed_forward.w2.weight",
+      "name": "primals_133",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.feed_forward.w3.weight",
+      "name": "primals_134",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.attention_norm.weight",
+      "name": "primals_135",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.14.ffn_norm.weight",
+      "name": "primals_136",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.attention.wq.weight",
+      "name": "primals_137",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.attention.wk.weight",
+      "name": "primals_138",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.attention.wv.weight",
+      "name": "primals_139",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.attention.wo.weight",
+      "name": "primals_140",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.feed_forward.w1.weight",
+      "name": "primals_141",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.feed_forward.w2.weight",
+      "name": "primals_142",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.feed_forward.w3.weight",
+      "name": "primals_143",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.attention_norm.weight",
+      "name": "primals_144",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.15.ffn_norm.weight",
+      "name": "primals_145",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.attention.wq.weight",
+      "name": "primals_146",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.attention.wk.weight",
+      "name": "primals_147",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.attention.wv.weight",
+      "name": "primals_148",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.attention.wo.weight",
+      "name": "primals_149",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.feed_forward.w1.weight",
+      "name": "primals_150",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.feed_forward.w2.weight",
+      "name": "primals_151",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.feed_forward.w3.weight",
+      "name": "primals_152",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.attention_norm.weight",
+      "name": "primals_153",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.16.ffn_norm.weight",
+      "name": "primals_154",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.attention.wq.weight",
+      "name": "primals_155",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.attention.wk.weight",
+      "name": "primals_156",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.attention.wv.weight",
+      "name": "primals_157",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.attention.wo.weight",
+      "name": "primals_158",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.feed_forward.w1.weight",
+      "name": "primals_159",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.feed_forward.w2.weight",
+      "name": "primals_160",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.feed_forward.w3.weight",
+      "name": "primals_161",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.attention_norm.weight",
+      "name": "primals_162",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.17.ffn_norm.weight",
+      "name": "primals_163",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.attention.wq.weight",
+      "name": "primals_164",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.attention.wk.weight",
+      "name": "primals_165",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.attention.wv.weight",
+      "name": "primals_166",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.attention.wo.weight",
+      "name": "primals_167",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.feed_forward.w1.weight",
+      "name": "primals_168",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.feed_forward.w2.weight",
+      "name": "primals_169",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.feed_forward.w3.weight",
+      "name": "primals_170",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.attention_norm.weight",
+      "name": "primals_171",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.18.ffn_norm.weight",
+      "name": "primals_172",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.attention.wq.weight",
+      "name": "primals_173",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.attention.wk.weight",
+      "name": "primals_174",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.attention.wv.weight",
+      "name": "primals_175",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.attention.wo.weight",
+      "name": "primals_176",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.feed_forward.w1.weight",
+      "name": "primals_177",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.feed_forward.w2.weight",
+      "name": "primals_178",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.feed_forward.w3.weight",
+      "name": "primals_179",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.attention_norm.weight",
+      "name": "primals_180",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.19.ffn_norm.weight",
+      "name": "primals_181",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.attention.wq.weight",
+      "name": "primals_182",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.attention.wk.weight",
+      "name": "primals_183",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.attention.wv.weight",
+      "name": "primals_184",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.attention.wo.weight",
+      "name": "primals_185",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.feed_forward.w1.weight",
+      "name": "primals_186",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.feed_forward.w2.weight",
+      "name": "primals_187",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.feed_forward.w3.weight",
+      "name": "primals_188",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.attention_norm.weight",
+      "name": "primals_189",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.20.ffn_norm.weight",
+      "name": "primals_190",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.attention.wq.weight",
+      "name": "primals_191",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.attention.wk.weight",
+      "name": "primals_192",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.attention.wv.weight",
+      "name": "primals_193",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.attention.wo.weight",
+      "name": "primals_194",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.feed_forward.w1.weight",
+      "name": "primals_195",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.feed_forward.w2.weight",
+      "name": "primals_196",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.feed_forward.w3.weight",
+      "name": "primals_197",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.attention_norm.weight",
+      "name": "primals_198",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.21.ffn_norm.weight",
+      "name": "primals_199",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.attention.wq.weight",
+      "name": "primals_200",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.attention.wk.weight",
+      "name": "primals_201",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.attention.wv.weight",
+      "name": "primals_202",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.attention.wo.weight",
+      "name": "primals_203",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.feed_forward.w1.weight",
+      "name": "primals_204",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.feed_forward.w2.weight",
+      "name": "primals_205",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.feed_forward.w3.weight",
+      "name": "primals_206",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.attention_norm.weight",
+      "name": "primals_207",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.22.ffn_norm.weight",
+      "name": "primals_208",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.attention.wq.weight",
+      "name": "primals_209",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.attention.wk.weight",
+      "name": "primals_210",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.attention.wv.weight",
+      "name": "primals_211",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.attention.wo.weight",
+      "name": "primals_212",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.feed_forward.w1.weight",
+      "name": "primals_213",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.feed_forward.w2.weight",
+      "name": "primals_214",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.feed_forward.w3.weight",
+      "name": "primals_215",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.attention_norm.weight",
+      "name": "primals_216",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.23.ffn_norm.weight",
+      "name": "primals_217",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.attention.wq.weight",
+      "name": "primals_218",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.attention.wk.weight",
+      "name": "primals_219",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.attention.wv.weight",
+      "name": "primals_220",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.attention.wo.weight",
+      "name": "primals_221",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.feed_forward.w1.weight",
+      "name": "primals_222",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.feed_forward.w2.weight",
+      "name": "primals_223",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.feed_forward.w3.weight",
+      "name": "primals_224",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.attention_norm.weight",
+      "name": "primals_225",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.24.ffn_norm.weight",
+      "name": "primals_226",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.attention.wq.weight",
+      "name": "primals_227",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.attention.wk.weight",
+      "name": "primals_228",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.attention.wv.weight",
+      "name": "primals_229",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.attention.wo.weight",
+      "name": "primals_230",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.feed_forward.w1.weight",
+      "name": "primals_231",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.feed_forward.w2.weight",
+      "name": "primals_232",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.feed_forward.w3.weight",
+      "name": "primals_233",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.attention_norm.weight",
+      "name": "primals_234",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.25.ffn_norm.weight",
+      "name": "primals_235",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.attention.wq.weight",
+      "name": "primals_236",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.attention.wk.weight",
+      "name": "primals_237",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.attention.wv.weight",
+      "name": "primals_238",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.attention.wo.weight",
+      "name": "primals_239",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.feed_forward.w1.weight",
+      "name": "primals_240",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.feed_forward.w2.weight",
+      "name": "primals_241",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.feed_forward.w3.weight",
+      "name": "primals_242",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.attention_norm.weight",
+      "name": "primals_243",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.26.ffn_norm.weight",
+      "name": "primals_244",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.attention.wq.weight",
+      "name": "primals_245",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.attention.wk.weight",
+      "name": "primals_246",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.attention.wv.weight",
+      "name": "primals_247",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.attention.wo.weight",
+      "name": "primals_248",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.feed_forward.w1.weight",
+      "name": "primals_249",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.feed_forward.w2.weight",
+      "name": "primals_250",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.feed_forward.w3.weight",
+      "name": "primals_251",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.attention_norm.weight",
+      "name": "primals_252",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.27.ffn_norm.weight",
+      "name": "primals_253",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.attention.wq.weight",
+      "name": "primals_254",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.attention.wk.weight",
+      "name": "primals_255",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.attention.wv.weight",
+      "name": "primals_256",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.attention.wo.weight",
+      "name": "primals_257",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.feed_forward.w1.weight",
+      "name": "primals_258",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.feed_forward.w2.weight",
+      "name": "primals_259",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.feed_forward.w3.weight",
+      "name": "primals_260",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.attention_norm.weight",
+      "name": "primals_261",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.28.ffn_norm.weight",
+      "name": "primals_262",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.attention.wq.weight",
+      "name": "primals_263",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.attention.wk.weight",
+      "name": "primals_264",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.attention.wv.weight",
+      "name": "primals_265",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.attention.wo.weight",
+      "name": "primals_266",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.feed_forward.w1.weight",
+      "name": "primals_267",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.feed_forward.w2.weight",
+      "name": "primals_268",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.feed_forward.w3.weight",
+      "name": "primals_269",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.attention_norm.weight",
+      "name": "primals_270",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.29.ffn_norm.weight",
+      "name": "primals_271",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.attention.wq.weight",
+      "name": "primals_272",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.attention.wk.weight",
+      "name": "primals_273",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.attention.wv.weight",
+      "name": "primals_274",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.attention.wo.weight",
+      "name": "primals_275",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.feed_forward.w1.weight",
+      "name": "primals_276",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.feed_forward.w2.weight",
+      "name": "primals_277",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.feed_forward.w3.weight",
+      "name": "primals_278",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.attention_norm.weight",
+      "name": "primals_279",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.30.ffn_norm.weight",
+      "name": "primals_280",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.attention.wq.weight",
+      "name": "primals_281",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.attention.wk.weight",
+      "name": "primals_282",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.attention.wv.weight",
+      "name": "primals_283",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.attention.wo.weight",
+      "name": "primals_284",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.feed_forward.w1.weight",
+      "name": "primals_285",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.feed_forward.w2.weight",
+      "name": "primals_286",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.feed_forward.w3.weight",
+      "name": "primals_287",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.attention_norm.weight",
+      "name": "primals_288",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "layers.31.ffn_norm.weight",
+      "name": "primals_289",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "norm.weight",
+      "name": "primals_290",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [],
+      "module_path": "output.weight",
+      "name": "primals_291",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "param",
+      "placement": "S(0)S(0)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [],
+      "module_path": "freqs_cis",
+      "name": "primals_292",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "buffer",
+      "placement": "RR",
+      "shape": [
+        8192,
+        64
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "int64",
+      "inputs": [],
+      "name": "primals_293",
+      "op": "placeholder",
+      "phase": "forward",
+      "placeholder_kind": "input",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [],
+      "name": "tangents_1",
+      "op": "placeholder",
+      "phase": "backward",
+      "placeholder_kind": "tangent",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        128256
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 76.40578345195063,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(1)S(1)",
+          "name": "primals_1",
+          "src_placement": "S(1)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "dtype_cast",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(1)S(1)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "int64",
+      "inputs": [
+        {
+          "comm_cost": 21.38246153846154,
+          "dst_placement": "RR",
+          "name": "primals_293",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "alias_default_1",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        8,
+        8192
+      ],
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 38.685829146330285,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(1)S(1)",
+          "name": "dtype_cast",
+          "src_placement": "S(1)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_1",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "embedding",
+      "op": "aten.embedding.default",
+      "phase": "forward",
+      "placement": "S(2)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 539
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 0,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_9",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "dtype_cast_1",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 706.2108351658422,
+          "dst_placement": "S(0)S(1)",
+          "name": "embedding",
+          "src_placement": "S(2)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "alias_default_3",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 539
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 1,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_5",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "pow_1",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mean",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "add",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "rsqrt",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_6",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_1",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_4",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_4",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_1",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type_1",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_2",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "dtype_cast_2",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_2",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "permute",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_7",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "alias_default_8",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_7",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_8",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "einsum_default",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_3",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "dtype_cast_3",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_3",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "permute_1",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "alias_default_9",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_7",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_9",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "einsum_default_1",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_4",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "dtype_cast_4",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_4",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "permute_2",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_2",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "alias_default_10",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_7",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_10",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "einsum_default_2",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_6",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_7",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_8",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_8",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_9",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_complex",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_9",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_10",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_complex_1",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "primals_292",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "name": "alias_default",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        8192,
+        64
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_11",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_11",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "mul_2",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_real",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_12",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "mul_3",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_real_1",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_13",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_10",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_11",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "unsqueeze",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "expand",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "clone",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_14",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "unsqueeze_1",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "expand_1",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "clone_1",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_15",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_3",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_4",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_5",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_12",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_13",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_14",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem_1",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem_6",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem_7",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "alias_default_15",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_6",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_16",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_5",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "dtype_cast_5",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_5",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "permute_7",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_16",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_7",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "alias_default_17",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_17",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "einsum_default_3",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0",
+      "name": "add_1",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_10",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "dtype_cast_6",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0",
+      "name": "alias_default_18",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_14",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_20",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "pow_2",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mean_1",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "add_2",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "rsqrt_1",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_21",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_4",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_6",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_19",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_19",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_5",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_15",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_6",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "dtype_cast_7",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_7",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "permute_8",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_22",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_8",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "alias_default_23",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_22",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_23",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "einsum_default_4",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "alias_default_24",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "convert_element_type_18",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_25",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "neg",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "exp",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "add_3",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "div",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "convert_element_type_19",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_8",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "dtype_cast_8",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_8",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "permute_9",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_9",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "alias_default_27",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_22",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_27",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "einsum_default_5",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_26",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "alias_default_28",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_6",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "dtype_cast_9",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "permute_10",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_29",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_10",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "alias_default_30",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_30",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "einsum_default_6",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_6",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0",
+      "name": "add_4",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_18",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "dtype_cast_10",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0",
+      "name": "alias_default_31",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_24",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_33",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "pow_3",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mean_2",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "add_5",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "rsqrt_2",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_34",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_7",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_10",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_32",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_32",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_8",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_25",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_11",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "dtype_cast_11",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_11",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "permute_11",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_35",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_11",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "alias_default_36",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_35",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_36",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "einsum_default_7",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_12",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "dtype_cast_12",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_12",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "permute_12",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_12",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "alias_default_37",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_35",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_37",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "einsum_default_8",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_13",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "dtype_cast_13",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_13",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "permute_13",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_13",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "alias_default_38",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_35",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_38",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "einsum_default_9",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_31",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_32",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_33",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_32",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_34",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_complex_2",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_33",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_35",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_complex_3",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_36",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_36",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_39",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_39",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "mul_9",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_real_2",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_37",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_39",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "mul_10",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_real_3",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_38",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_34",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_35",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "unsqueeze_2",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "expand_2",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "clone_2",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_39",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "unsqueeze_3",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "expand_3",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "clone_3",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_40",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_14",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_15",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_16",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_40",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_41",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_42",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_1",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_9",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_10",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_1",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_15",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_1",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_16",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "alias_default_43",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_17",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_41",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_14",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "dtype_cast_14",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_14",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "permute_18",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_44",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_18",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "alias_default_45",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_45",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "einsum_default_10",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1",
+      "name": "add_6",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_19",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "dtype_cast_15",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1",
+      "name": "alias_default_46",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_38",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_48",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "pow_4",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mean_3",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "add_7",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "rsqrt_3",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_49",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_11",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_15",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_47",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_47",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_12",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_39",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_15",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "dtype_cast_16",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_16",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "permute_19",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_50",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_19",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "alias_default_51",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_50",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_51",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "einsum_default_11",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "alias_default_52",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "convert_element_type_42",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_53",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "neg_1",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "exp_1",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "add_8",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "div_1",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "convert_element_type_43",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_17",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "dtype_cast_17",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_17",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "permute_20",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_20",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "alias_default_55",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_50",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_55",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "einsum_default_12",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_54",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "alias_default_56",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_13",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "dtype_cast_18",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "permute_21",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_57",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_21",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "alias_default_58",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_58",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "einsum_default_13",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_13",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1",
+      "name": "add_9",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_27",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "dtype_cast_19",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1",
+      "name": "alias_default_59",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_48",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_61",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "pow_5",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mean_4",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "add_10",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "rsqrt_4",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_62",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_14",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_19",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_60",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_60",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_15",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_49",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_20",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "dtype_cast_20",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_20",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "permute_22",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_63",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_22",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "alias_default_64",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_63",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_64",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "einsum_default_14",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_21",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "dtype_cast_21",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_21",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "permute_23",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_23",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "alias_default_65",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_63",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_65",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "einsum_default_15",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_22",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "dtype_cast_22",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_22",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "permute_24",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_24",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "alias_default_66",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_63",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_66",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "einsum_default_16",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_56",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_57",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_58",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_56",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_59",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_complex_4",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_57",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_60",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_complex_5",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_61",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_61",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_67",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_67",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "mul_16",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_real_4",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_62",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_67",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "mul_17",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_real_5",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_63",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_58",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_59",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "unsqueeze_4",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "expand_4",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "clone_4",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_64",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "unsqueeze_5",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "expand_5",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "clone_5",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_65",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_25",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_64",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_26",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_65",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_27",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_68",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_69",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_70",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_68",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_2",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_18",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_19",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_2",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_24",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_2",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_25",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "alias_default_71",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_28",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_66",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_23",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "dtype_cast_23",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_23",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "permute_29",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_66",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_72",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_29",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "alias_default_73",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_73",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "einsum_default_17",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2",
+      "name": "add_11",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_28",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "dtype_cast_24",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2",
+      "name": "alias_default_74",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_62",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_76",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_76",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "pow_6",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mean_5",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "add_12",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "rsqrt_5",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_77",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_76",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_18",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_24",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_75",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_75",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_19",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_63",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_24",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "dtype_cast_25",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_25",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "permute_30",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_78",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_30",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "alias_default_79",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_78",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_79",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "einsum_default_18",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "alias_default_80",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_80",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "convert_element_type_66",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_66",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_81",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "neg_2",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "exp_2",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "add_13",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "div_2",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "convert_element_type_67",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_26",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "dtype_cast_26",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_26",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "permute_31",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_31",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "alias_default_83",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_78",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_83",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "einsum_default_19",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_67",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_82",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "alias_default_84",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_20",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "dtype_cast_27",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "permute_32",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_85",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_32",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "alias_default_86",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_86",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "einsum_default_20",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_20",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2",
+      "name": "add_14",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_36",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "dtype_cast_28",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2",
+      "name": "alias_default_87",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_72",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_89",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_89",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "pow_7",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mean_6",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "add_15",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "rsqrt_6",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_90",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_89",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_21",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_28",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_88",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_88",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_22",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_73",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_29",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "dtype_cast_29",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_29",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "permute_33",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_73",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_91",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_33",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "alias_default_92",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_91",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_92",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "einsum_default_21",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_30",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "dtype_cast_30",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_30",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "permute_34",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_34",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "alias_default_93",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_91",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_93",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "einsum_default_22",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_31",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "dtype_cast_31",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_31",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "permute_35",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_35",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "alias_default_94",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_91",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_94",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "einsum_default_23",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_81",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_82",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_83",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_80",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_80",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_84",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_complex_6",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_81",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_85",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_complex_7",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_86",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_86",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_95",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_95",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "mul_23",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_real_6",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_87",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_95",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "mul_24",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_real_7",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_88",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_87",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_82",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_88",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_83",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "unsqueeze_6",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "expand_6",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "clone_6",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_89",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "unsqueeze_7",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "expand_7",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "clone_7",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_90",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_36",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_89",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_37",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_90",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_38",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_96",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_97",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_98",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_97",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_98",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_3",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_27",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_28",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_3",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_33",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_3",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_34",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "alias_default_99",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_39",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_91",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_32",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "dtype_cast_32",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_32",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "permute_40",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_100",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_40",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "alias_default_101",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_100",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_101",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "einsum_default_24",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3",
+      "name": "add_16",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_37",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "dtype_cast_33",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3",
+      "name": "alias_default_102",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_86",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_86",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_104",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_104",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "pow_8",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mean_7",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "add_17",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "rsqrt_7",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_105",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_104",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_105",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_25",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_33",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_103",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_103",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_26",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_87",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_33",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "dtype_cast_34",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_34",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "permute_41",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_106",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_41",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "alias_default_107",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_106",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_107",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "einsum_default_25",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "alias_default_108",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "convert_element_type_90",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_90",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_109",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "neg_3",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "exp_3",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "add_18",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "div_3",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "convert_element_type_91",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_35",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "dtype_cast_35",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_35",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "permute_42",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_42",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "alias_default_111",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_106",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_111",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "einsum_default_26",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_91",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_110",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "alias_default_112",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_27",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "dtype_cast_36",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "permute_43",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_113",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_43",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "alias_default_114",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_114",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "einsum_default_27",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_27",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3",
+      "name": "add_19",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_45",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "dtype_cast_37",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3",
+      "name": "alias_default_115",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_96",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_117",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "pow_9",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mean_8",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "add_20",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "rsqrt_8",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_118",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_118",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_28",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_37",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_116",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_116",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_29",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_97",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_38",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "dtype_cast_38",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_38",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "permute_44",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_97",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_119",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_44",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "alias_default_120",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_119",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_120",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "einsum_default_28",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_39",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "dtype_cast_39",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_39",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "permute_45",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_45",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "alias_default_121",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_119",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_121",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "einsum_default_29",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_40",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "dtype_cast_40",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_40",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "permute_46",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_46",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "alias_default_122",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_119",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_122",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "einsum_default_30",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_106",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_107",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_108",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_104",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_109",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_complex_8",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_105",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_110",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_complex_9",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_111",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_111",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_123",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_123",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "mul_30",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_real_8",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_112",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_123",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "mul_31",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_real_9",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_113",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_106",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_107",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "unsqueeze_8",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "expand_8",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "clone_8",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_114",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "unsqueeze_9",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "expand_9",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "clone_9",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_115",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_47",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_48",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_115",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_49",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_124",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_125",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_126",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_4",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_36",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_37",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_4",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_42",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_4",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_43",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "alias_default_127",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_50",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_116",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_41",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "dtype_cast_41",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_41",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "permute_51",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_116",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_128",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_51",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "alias_default_129",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_128",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_129",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "einsum_default_31",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4",
+      "name": "add_21",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_46",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "dtype_cast_42",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4",
+      "name": "alias_default_130",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_110",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_110",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_132",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_132",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "pow_10",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mean_9",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "add_22",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "rsqrt_9",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_133",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_132",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_133",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_32",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_42",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_131",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_131",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_33",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_111",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_42",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "dtype_cast_43",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_43",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "permute_52",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_111",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_134",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_52",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "alias_default_135",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_134",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_135",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "einsum_default_32",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "alias_default_136",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_136",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "convert_element_type_114",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_137",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "neg_4",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "exp_4",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "add_23",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "div_4",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "convert_element_type_115",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_44",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "dtype_cast_44",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_44",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "permute_53",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_53",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "alias_default_139",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_134",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_139",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "einsum_default_33",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_115",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_138",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "alias_default_140",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_140",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_34",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "dtype_cast_45",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "permute_54",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_141",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_54",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "alias_default_142",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_141",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_142",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "einsum_default_34",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_34",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4",
+      "name": "add_24",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_54",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "dtype_cast_46",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4",
+      "name": "alias_default_143",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_143",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_120",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_120",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_145",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "pow_11",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mean_10",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "add_25",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "rsqrt_10",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_146",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_35",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_46",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_144",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_144",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_36",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_121",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_47",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "dtype_cast_47",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_47",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "permute_55",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_121",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_147",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_55",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "alias_default_148",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_147",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_148",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "einsum_default_35",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_48",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "dtype_cast_48",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_48",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "permute_56",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_56",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "alias_default_149",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_147",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_149",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "einsum_default_36",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_49",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "dtype_cast_49",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_49",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "permute_57",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_57",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "alias_default_150",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_147",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_150",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "einsum_default_37",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_131",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_132",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_133",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_131",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_128",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_134",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_134",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_complex_10",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_132",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_129",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_129",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_135",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_135",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_complex_11",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_136",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_136",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_151",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "mul_37",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_real_10",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_137",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "mul_38",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_real_11",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_138",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_130",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_131",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_131",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "unsqueeze_10",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "expand_10",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "clone_10",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_139",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_133",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "unsqueeze_11",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "expand_11",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "clone_11",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_140",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_130",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_58",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_139",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_59",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_140",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_60",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_58",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_152",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_153",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_154",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_152",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_153",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_5",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_45",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_46",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_5",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_51",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_5",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_52",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "alias_default_155",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_155",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_61",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_141",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_50",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "dtype_cast_50",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_50",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "permute_62",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_156",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_62",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "alias_default_157",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_157",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "einsum_default_38",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_143",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5",
+      "name": "add_26",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_55",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "dtype_cast_51",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5",
+      "name": "alias_default_158",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_158",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_134",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_134",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_160",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_160",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "pow_12",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mean_11",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "add_27",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "rsqrt_11",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_161",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_160",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_161",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_39",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_51",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_159",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_159",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_40",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_135",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_51",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "dtype_cast_52",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_52",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "permute_63",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_135",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_162",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_63",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "alias_default_163",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_162",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_163",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "einsum_default_39",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "alias_default_164",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_164",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "convert_element_type_138",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_165",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "neg_5",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "exp_5",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "add_28",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "div_5",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "convert_element_type_139",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_53",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "dtype_cast_53",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_53",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "permute_64",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_64",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "alias_default_167",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_162",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_167",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "einsum_default_40",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_139",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_166",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "alias_default_168",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_166",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_41",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "dtype_cast_54",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "permute_65",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_169",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_65",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "alias_default_170",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_169",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_170",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "einsum_default_41",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_158",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_41",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5",
+      "name": "add_29",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_63",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "dtype_cast_55",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5",
+      "name": "alias_default_171",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_144",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_144",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_173",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "pow_13",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mean_12",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "add_30",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "rsqrt_12",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_174",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_174",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_42",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_55",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_172",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_172",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_43",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_145",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_56",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "dtype_cast_56",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_56",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "permute_66",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_175",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_66",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "alias_default_176",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_175",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_176",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "einsum_default_42",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_57",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "dtype_cast_57",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_57",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "permute_67",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_67",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "alias_default_177",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_175",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_177",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "einsum_default_43",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_58",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "dtype_cast_58",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_58",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "permute_68",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_68",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "alias_default_178",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_175",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_178",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "einsum_default_44",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_156",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_157",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_158",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_156",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_152",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_152",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_159",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_159",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_complex_12",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_157",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_153",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_153",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_160",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_160",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_complex_13",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_161",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_161",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_179",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_179",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "mul_44",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_real_12",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_162",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_179",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "mul_45",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_real_13",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_163",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_162",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_154",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_163",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_155",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_155",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "unsqueeze_12",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "expand_12",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "clone_12",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_164",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_158",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "unsqueeze_13",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "expand_13",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "clone_13",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_165",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_154",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_69",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_164",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_70",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_71",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_180",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_181",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_182",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_182",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_6",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_54",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_55",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_6",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_60",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_6",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_61",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "alias_default_183",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_183",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_72",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_72",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_166",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_59",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "dtype_cast_59",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_59",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "permute_73",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_166",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_184",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_73",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "alias_default_185",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_184",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_185",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "einsum_default_45",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6",
+      "name": "add_31",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_64",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "dtype_cast_60",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6",
+      "name": "alias_default_186",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_158",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_158",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_188",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_188",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "pow_14",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mean_13",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "add_32",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "rsqrt_13",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_189",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_188",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_46",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_60",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_187",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_187",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_47",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_159",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_60",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "dtype_cast_61",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_61",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "permute_74",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_159",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_190",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_74",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "alias_default_191",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_190",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_191",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "einsum_default_46",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "alias_default_192",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_192",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "convert_element_type_162",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_162",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_193",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_193",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "neg_6",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "exp_6",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "add_33",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_193",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "div_6",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "convert_element_type_163",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_62",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "dtype_cast_62",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_62",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "permute_75",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_75",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "alias_default_195",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_190",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_195",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "einsum_default_47",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_163",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_194",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "alias_default_196",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_194",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_196",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_48",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "dtype_cast_63",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "permute_76",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_197",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_76",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "alias_default_198",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_198",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "einsum_default_48",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_48",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6",
+      "name": "add_34",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_72",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "dtype_cast_64",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6",
+      "name": "alias_default_199",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_199",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_168",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_168",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_201",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_201",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "pow_15",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mean_14",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "add_35",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "rsqrt_14",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_202",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_201",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_49",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_64",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_200",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_200",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_50",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_169",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_65",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "dtype_cast_65",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_65",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "permute_77",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_169",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_203",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_77",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "alias_default_204",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_203",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_204",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "einsum_default_49",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_66",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "dtype_cast_66",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_66",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "permute_78",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_78",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "alias_default_205",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_203",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_205",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "einsum_default_50",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_67",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "dtype_cast_67",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_67",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "permute_79",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_79",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "alias_default_206",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_203",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_206",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "einsum_default_51",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_181",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_182",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_183",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_181",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_176",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_184",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_184",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_complex_14",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_182",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_177",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_177",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_185",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_185",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_complex_15",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_186",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_186",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_207",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_207",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "mul_51",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_real_14",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_187",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_207",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "mul_52",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_real_15",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_188",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_187",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_178",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_188",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_179",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_179",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "unsqueeze_14",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "expand_14",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "clone_14",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_189",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_183",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "unsqueeze_15",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "expand_15",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "clone_15",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_190",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_178",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_80",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_189",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_81",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_190",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_82",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_80",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_208",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_209",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_82",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_210",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_208",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_209",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_210",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_7",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_63",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_64",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_7",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_69",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_7",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_70",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "alias_default_211",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_211",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_83",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_191",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_68",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "dtype_cast_68",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_68",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "permute_84",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_191",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_212",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_84",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "alias_default_213",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_212",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_213",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "einsum_default_52",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_199",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7",
+      "name": "add_36",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_73",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "dtype_cast_69",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7",
+      "name": "alias_default_214",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_182",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_182",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_216",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "pow_16",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mean_15",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "add_37",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "rsqrt_15",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_217",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_53",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_69",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_215",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_53",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_215",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_54",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_183",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_69",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "dtype_cast_70",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_70",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "permute_85",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_183",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_218",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_85",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "alias_default_219",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_218",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_219",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "einsum_default_53",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "alias_default_220",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_220",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "convert_element_type_186",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_186",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_221",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_221",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "neg_7",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "exp_7",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "add_38",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_221",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "div_7",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "convert_element_type_187",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_71",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "dtype_cast_71",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_71",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "permute_86",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_86",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "alias_default_223",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_218",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_223",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "einsum_default_54",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_187",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_222",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "alias_default_224",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_222",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_55",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "dtype_cast_72",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "permute_87",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_225",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_87",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "alias_default_226",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_225",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_226",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "einsum_default_55",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_55",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7",
+      "name": "add_39",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_81",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "dtype_cast_73",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7",
+      "name": "alias_default_227",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_192",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_192",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_229",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_229",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "pow_17",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mean_16",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "add_40",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "rsqrt_16",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_230",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_229",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_230",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_56",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_73",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_228",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_228",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_57",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_193",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_74",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "dtype_cast_74",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_74",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "permute_88",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_193",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_231",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_88",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "alias_default_232",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_231",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_232",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "einsum_default_56",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_75",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "dtype_cast_75",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_75",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "permute_89",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_89",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "alias_default_233",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_231",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_233",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "einsum_default_57",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_76",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "dtype_cast_76",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_76",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "permute_90",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_90",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "alias_default_234",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_231",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_234",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "einsum_default_58",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_206",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_207",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_208",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_206",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_200",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_200",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_209",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_209",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_complex_16",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_207",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_201",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_201",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_210",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_210",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_complex_17",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_211",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_211",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_235",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_235",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "mul_58",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_real_16",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_212",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_235",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "mul_59",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_real_17",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_213",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_212",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_202",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_213",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_203",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_203",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "unsqueeze_16",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "expand_16",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "clone_16",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_214",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_208",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "unsqueeze_17",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "expand_17",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "clone_17",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_215",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_202",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_91",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_214",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_92",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_215",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_93",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_236",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_92",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_237",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_93",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_238",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_238",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_8",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_72",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_73",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_8",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_78",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_8",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_79",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "alias_default_239",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_239",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_94",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_94",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_216",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_77",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "dtype_cast_77",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_77",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "permute_95",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_240",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_95",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "alias_default_241",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_240",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_241",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "einsum_default_59",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8",
+      "name": "add_41",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_82",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "dtype_cast_78",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8",
+      "name": "alias_default_242",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_206",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_206",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_244",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_244",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "pow_18",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mean_17",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "add_42",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "rsqrt_17",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_245",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_244",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_245",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_60",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_78",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_243",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_243",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_61",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_207",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_78",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "dtype_cast_79",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_79",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "permute_96",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_207",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_246",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_96",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "alias_default_247",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_246",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_247",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "einsum_default_60",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "alias_default_248",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "convert_element_type_210",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_210",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_249",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_249",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "neg_8",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "exp_8",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "add_43",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_249",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "div_8",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "convert_element_type_211",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_80",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "dtype_cast_80",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_80",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "permute_97",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_97",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "alias_default_251",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_246",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_251",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "einsum_default_61",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_211",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_250",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "alias_default_252",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_250",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_252",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_62",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_79",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "dtype_cast_81",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "permute_98",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_253",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_98",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "alias_default_254",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_253",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_254",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "einsum_default_62",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_62",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8",
+      "name": "add_44",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_90",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "dtype_cast_82",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8",
+      "name": "alias_default_255",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_255",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_216",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_257",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_257",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "pow_19",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mean_18",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "add_45",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "rsqrt_18",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_258",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_257",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_63",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_82",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_256",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_256",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_64",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_217",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_83",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "dtype_cast_83",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_83",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "permute_99",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_259",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_99",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "alias_default_260",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_259",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_260",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "einsum_default_63",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_84",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "dtype_cast_84",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_84",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "permute_100",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_100",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "alias_default_261",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_259",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_261",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "einsum_default_64",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_85",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "dtype_cast_85",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_85",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "permute_101",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_101",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "alias_default_262",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_259",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_262",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "einsum_default_65",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_231",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_64",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_232",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_65",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_233",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_231",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_224",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_234",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_234",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_complex_18",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_232",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_225",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_225",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_235",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_235",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_complex_19",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_236",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_236",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_263",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_263",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "mul_65",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_65",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_real_18",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_237",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_263",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "mul_66",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_66",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_real_19",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_238",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_237",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_226",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_238",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_227",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_227",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "unsqueeze_18",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "expand_18",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "clone_18",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_239",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_233",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "unsqueeze_19",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "expand_19",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "clone_19",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_240",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_226",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_102",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_239",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_103",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_240",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_104",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_264",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_103",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_265",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_104",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_266",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_264",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_265",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_9",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_81",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_82",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_9",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_87",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_9",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_88",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "alias_default_267",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_267",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_105",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_241",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_86",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "dtype_cast_86",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_86",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "permute_106",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_268",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_106",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "alias_default_269",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_269",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "einsum_default_66",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_255",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_66",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9",
+      "name": "add_46",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_91",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "dtype_cast_87",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9",
+      "name": "alias_default_270",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_230",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_230",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_272",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_272",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "pow_20",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mean_19",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "add_47",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "rsqrt_19",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_273",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_272",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_273",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_67",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_87",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_271",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_67",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_271",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_68",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_68",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_231",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_87",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "dtype_cast_88",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_88",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "permute_107",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_231",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_274",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_107",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "alias_default_275",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_274",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_275",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "einsum_default_67",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_67",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "alias_default_276",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_276",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "convert_element_type_234",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_234",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_277",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_277",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "neg_9",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "exp_9",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "add_48",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_277",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "div_9",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "convert_element_type_235",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_89",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "dtype_cast_89",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_89",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "permute_108",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_108",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "alias_default_279",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_274",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_279",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "einsum_default_68",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_235",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_278",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_68",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "alias_default_280",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_278",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_280",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_69",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_88",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "dtype_cast_90",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "permute_109",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_69",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_281",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_109",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "alias_default_282",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_282",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "einsum_default_69",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_69",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9",
+      "name": "add_49",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_99",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "dtype_cast_91",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9",
+      "name": "alias_default_283",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_240",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_240",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_285",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_285",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "pow_21",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mean_20",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "add_50",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "rsqrt_20",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_286",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_285",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_286",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_70",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_91",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_284",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_284",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_71",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_241",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_92",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "dtype_cast_92",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_92",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "permute_110",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_287",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_110",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "alias_default_288",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_287",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_288",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "einsum_default_70",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_93",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "dtype_cast_93",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_93",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "permute_111",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_111",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "alias_default_289",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_287",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_289",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "einsum_default_71",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_94",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "dtype_cast_94",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_94",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "permute_112",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_112",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "alias_default_290",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_287",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_290",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "einsum_default_72",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_70",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_256",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_71",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_257",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_72",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_258",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_256",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_248",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_259",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_259",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_complex_20",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_257",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_249",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_249",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_260",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_260",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_complex_21",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_261",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_261",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_291",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_291",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "mul_72",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_72",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_real_20",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_262",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_291",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "mul_73",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_73",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_real_21",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_263",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_262",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_250",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_263",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_251",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_251",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "unsqueeze_20",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "expand_20",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "clone_20",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_264",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_258",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "unsqueeze_21",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "expand_21",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "clone_21",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_265",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_250",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_113",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_264",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_114",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_265",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_115",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_113",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_292",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_114",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_293",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_294",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_292",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_293",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_294",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_10",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_90",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_91",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_10",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_96",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_10",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_97",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "alias_default_295",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_295",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_116",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_116",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_266",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_95",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "dtype_cast_95",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_95",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "permute_117",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_296",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_117",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "alias_default_297",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_296",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_297",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "einsum_default_73",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_73",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10",
+      "name": "add_51",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_100",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "dtype_cast_96",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10",
+      "name": "alias_default_298",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_254",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_254",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_300",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "pow_22",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mean_21",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "add_52",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "rsqrt_21",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_301",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_74",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_96",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_299",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_299",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_75",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_75",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_255",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_96",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "dtype_cast_97",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_97",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "permute_118",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_255",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_302",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_118",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "alias_default_303",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_302",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_303",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "einsum_default_74",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_74",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "alias_default_304",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_304",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "convert_element_type_258",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_258",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_305",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_305",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "neg_10",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "exp_10",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "add_53",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_305",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "div_10",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "convert_element_type_259",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_98",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "dtype_cast_98",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_98",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "permute_119",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_119",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "alias_default_307",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_302",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_307",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "einsum_default_75",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_259",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_306",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_75",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "alias_default_308",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_306",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_308",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_76",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_97",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "dtype_cast_99",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "permute_120",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_76",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_309",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_120",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "alias_default_310",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_309",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_310",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "einsum_default_76",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_76",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10",
+      "name": "add_54",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_108",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "dtype_cast_100",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10",
+      "name": "alias_default_311",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_264",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_264",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_313",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_313",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "pow_23",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mean_22",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "add_55",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "rsqrt_22",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_314",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_313",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_77",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_100",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_312",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_312",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_78",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_78",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_265",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_101",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "dtype_cast_101",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_101",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "permute_121",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_265",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_315",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_121",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "alias_default_316",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_315",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_316",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "einsum_default_77",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_102",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "dtype_cast_102",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_102",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "permute_122",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_122",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "alias_default_317",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_315",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_317",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "einsum_default_78",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_103",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "dtype_cast_103",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_103",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "permute_123",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_123",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "alias_default_318",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_315",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_318",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "einsum_default_79",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_77",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_281",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_78",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_282",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_79",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_283",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_272",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_272",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_284",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_284",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_complex_22",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_282",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_273",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_273",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_285",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_285",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_complex_23",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_286",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_286",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_319",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_319",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "mul_79",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_79",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_real_22",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_287",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_319",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "mul_80",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_80",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_real_23",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_288",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_287",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_274",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_288",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_275",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_275",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "unsqueeze_22",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "expand_22",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "clone_22",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_289",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_283",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "unsqueeze_23",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "expand_23",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "clone_23",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_290",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_274",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_124",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_289",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_125",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_290",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_126",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_320",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_321",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_322",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_320",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_11",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_99",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_100",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_105",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_106",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "alias_default_323",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_127",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_291",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_104",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "dtype_cast_104",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_104",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "permute_128",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_324",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_128",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "alias_default_325",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_324",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_325",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "einsum_default_80",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_80",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11",
+      "name": "add_56",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_109",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "dtype_cast_105",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11",
+      "name": "alias_default_326",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_278",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_278",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_328",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_328",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "pow_24",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mean_23",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "add_57",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "rsqrt_23",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_329",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_328",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_329",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_81",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_105",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_327",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_327",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_82",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_82",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_279",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_105",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "dtype_cast_106",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_106",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "permute_129",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_279",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_330",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_129",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "alias_default_331",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_330",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_331",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "einsum_default_81",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "alias_default_332",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_332",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "convert_element_type_282",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_282",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_333",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_333",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "neg_11",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "exp_11",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "add_58",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_333",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "div_11",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "convert_element_type_283",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_107",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "dtype_cast_107",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_107",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "permute_130",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_130",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "alias_default_335",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_330",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_335",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "einsum_default_82",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_283",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_334",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "alias_default_336",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_83",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_106",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "dtype_cast_108",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_108",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "permute_131",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_337",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_131",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "alias_default_338",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_337",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_338",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "einsum_default_83",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_83",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11",
+      "name": "add_59",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_117",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "dtype_cast_109",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11",
+      "name": "alias_default_339",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_288",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_288",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_341",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_341",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "pow_25",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mean_24",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "add_60",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "rsqrt_24",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_342",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_341",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_342",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_84",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_109",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_340",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_84",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_340",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_85",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_85",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_289",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_110",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "dtype_cast_110",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_110",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "permute_132",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_289",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_343",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_132",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "alias_default_344",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_343",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_344",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "einsum_default_84",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_111",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "dtype_cast_111",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_111",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "permute_133",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_133",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "alias_default_345",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_343",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_345",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "einsum_default_85",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_112",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "dtype_cast_112",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_112",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "permute_134",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_134",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "alias_default_346",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_343",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_346",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "einsum_default_86",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_306",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_307",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_86",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_308",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_306",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_296",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_296",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_309",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_309",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_complex_24",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_307",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_297",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_297",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_310",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_310",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_complex_25",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_311",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_311",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_347",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_347",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "mul_86",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_86",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_real_24",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_312",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_347",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "mul_87",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_87",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_real_25",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_313",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_312",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_298",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_313",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_299",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_299",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "unsqueeze_24",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "expand_24",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "clone_24",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_314",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_308",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "unsqueeze_25",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "expand_25",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "clone_25",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_315",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_298",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_135",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_314",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_136",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_315",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_137",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_135",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_348",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_136",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_349",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_137",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_350",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_350",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_12",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_108",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_109",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_12",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_114",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_12",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_115",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_108",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "alias_default_351",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_351",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_138",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_316",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_113",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "dtype_cast_113",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_113",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "permute_139",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_316",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_352",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_139",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "alias_default_353",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_352",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_353",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "einsum_default_87",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12",
+      "name": "add_61",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_118",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "dtype_cast_114",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12",
+      "name": "alias_default_354",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_302",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_302",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_356",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_356",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "pow_26",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mean_25",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "add_62",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "rsqrt_25",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_357",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_356",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_357",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_88",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_114",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_355",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_88",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_355",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_89",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_89",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_303",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_114",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "dtype_cast_115",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_115",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "permute_140",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_303",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_358",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_140",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "alias_default_359",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_358",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_359",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "einsum_default_88",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_88",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "alias_default_360",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_360",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "convert_element_type_306",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_306",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_361",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_361",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "neg_12",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "exp_12",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "add_63",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_361",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "div_12",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "convert_element_type_307",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_116",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "dtype_cast_116",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_116",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "permute_141",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_141",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "alias_default_363",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_358",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_363",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "einsum_default_89",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_307",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_362",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_89",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "alias_default_364",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_362",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_364",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_90",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "dtype_cast_117",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "permute_142",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_90",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_365",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_142",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "alias_default_366",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_365",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_366",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "einsum_default_90",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_90",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12",
+      "name": "add_64",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_126",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "dtype_cast_118",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12",
+      "name": "alias_default_367",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_367",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_312",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_312",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_369",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "pow_27",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mean_26",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "add_65",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_65",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "rsqrt_26",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_370",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_91",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_118",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_368",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_368",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_92",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_92",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_313",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_119",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "dtype_cast_119",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_119",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "permute_143",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_313",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_371",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_143",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "alias_default_372",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_371",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_372",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "einsum_default_91",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_120",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "dtype_cast_120",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_120",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "permute_144",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_144",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "alias_default_373",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_371",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_373",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "einsum_default_92",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_121",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "dtype_cast_121",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_121",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "permute_145",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_145",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "alias_default_374",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_371",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_374",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "einsum_default_93",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_91",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_331",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_92",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_332",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_93",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_333",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_331",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_320",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_320",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_334",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_complex_26",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_332",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_321",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_321",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_335",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_335",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_complex_27",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_336",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_336",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_375",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_375",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "mul_93",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_93",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_real_26",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_337",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_375",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "mul_94",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_94",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_real_27",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_338",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_337",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_322",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_338",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_323",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_323",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "unsqueeze_26",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "expand_26",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "clone_26",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_339",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_333",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "unsqueeze_27",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "expand_27",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "clone_27",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_340",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_322",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_146",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_339",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_147",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_340",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_148",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_376",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_377",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_148",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_378",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_376",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_377",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_13",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_117",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_118",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_13",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_123",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_13",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_124",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "alias_default_379",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_379",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_149",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_341",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_122",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "dtype_cast_122",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_122",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "permute_150",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_341",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_380",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_150",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "alias_default_381",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_381",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "einsum_default_94",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_367",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_94",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13",
+      "name": "add_66",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_127",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "dtype_cast_123",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_66",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13",
+      "name": "alias_default_382",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_326",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_384",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_384",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "pow_28",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mean_27",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "add_67",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_67",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "rsqrt_27",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_385",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_384",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_385",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_95",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_123",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_383",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_95",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_383",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_96",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_327",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_123",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "dtype_cast_124",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_124",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "permute_151",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_327",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_386",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_151",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "alias_default_387",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_386",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_387",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "einsum_default_95",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_95",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "alias_default_388",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_388",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "convert_element_type_330",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_330",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_389",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_389",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "neg_13",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "exp_13",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "add_68",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_389",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_68",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "div_13",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "convert_element_type_331",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_125",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "dtype_cast_125",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_125",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "permute_152",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_152",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "alias_default_391",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_386",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_391",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "einsum_default_96",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_331",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_390",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_96",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "alias_default_392",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_97",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "dtype_cast_126",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "permute_153",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_97",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_393",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_153",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "alias_default_394",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_394",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "einsum_default_97",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_97",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13",
+      "name": "add_69",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_135",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "dtype_cast_127",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13",
+      "name": "alias_default_395",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_395",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_336",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_336",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_397",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_397",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "pow_29",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mean_28",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "add_70",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "rsqrt_28",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_398",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_397",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_398",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_98",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_127",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_396",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_98",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_396",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_99",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_337",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_128",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "dtype_cast_128",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_128",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "permute_154",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_337",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_399",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_154",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "alias_default_400",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_399",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_400",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "einsum_default_98",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_129",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "dtype_cast_129",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_129",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "permute_155",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_155",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "alias_default_401",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_399",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_401",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "einsum_default_99",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_130",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "dtype_cast_130",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_130",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "permute_156",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_156",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "alias_default_402",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_399",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_402",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "einsum_default_100",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_98",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_356",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_99",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_357",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_100",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_358",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_356",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_344",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_344",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_359",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_359",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_complex_28",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_357",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_345",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_345",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_360",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_360",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_complex_29",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_361",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_361",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_403",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_403",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "mul_100",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_100",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_real_28",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_362",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_403",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "mul_101",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_real_29",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_363",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_362",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_346",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_363",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_347",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_347",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "unsqueeze_28",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "expand_28",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "clone_28",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_364",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_358",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "unsqueeze_29",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "expand_29",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "clone_29",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_365",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_346",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_157",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_364",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_158",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_365",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_159",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_404",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_158",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_405",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_159",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_406",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_404",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_405",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_406",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_14",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_126",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_127",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_14",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_132",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_14",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_133",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "alias_default_407",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_407",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_160",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_160",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_366",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_131",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "dtype_cast_131",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_131",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "permute_161",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_366",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_408",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_161",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "alias_default_409",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_408",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_409",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "einsum_default_101",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_395",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_101",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14",
+      "name": "add_71",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_136",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "dtype_cast_132",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14",
+      "name": "alias_default_410",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_410",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_350",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_350",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_412",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_412",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "pow_30",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mean_29",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "add_72",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "rsqrt_29",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_413",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_412",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_413",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_102",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_132",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_411",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_411",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_103",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_103",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_351",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_132",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "dtype_cast_133",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_133",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "permute_162",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_351",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_414",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_162",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "alias_default_415",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_414",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_415",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "einsum_default_102",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "alias_default_416",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "convert_element_type_354",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_354",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_417",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_417",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "neg_14",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "exp_14",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "add_73",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_417",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_73",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "div_14",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "convert_element_type_355",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_134",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "dtype_cast_134",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_134",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "permute_163",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_163",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "alias_default_419",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_414",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_419",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "einsum_default_103",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_355",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_418",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "alias_default_420",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_418",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_420",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_104",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_133",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "dtype_cast_135",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_135",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "permute_164",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_421",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_164",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "alias_default_422",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_421",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_422",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "einsum_default_104",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_410",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_104",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14",
+      "name": "add_74",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_144",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "dtype_cast_136",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14",
+      "name": "alias_default_423",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_423",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_360",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_360",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_425",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_425",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "pow_31",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mean_30",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "add_75",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_75",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "rsqrt_30",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_426",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_425",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_426",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_105",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_136",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_424",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_105",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_424",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_106",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_106",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_361",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_137",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "dtype_cast_137",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_137",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "permute_165",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_361",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_427",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_165",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "alias_default_428",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_427",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_428",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "einsum_default_105",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_138",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "dtype_cast_138",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_138",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "permute_166",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_166",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "alias_default_429",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_427",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_429",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "einsum_default_106",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_139",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "dtype_cast_139",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_139",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "permute_167",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_167",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "alias_default_430",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_427",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_430",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "einsum_default_107",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_381",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_382",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_383",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_381",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_368",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_368",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_384",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_384",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_complex_30",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_382",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_369",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_369",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_385",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_385",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_complex_31",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_386",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_386",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_431",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_431",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "mul_107",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_real_30",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_387",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_431",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "mul_108",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_real_31",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_388",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_387",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_370",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_388",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_371",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_371",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "unsqueeze_30",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "expand_30",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "clone_30",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_389",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_383",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "unsqueeze_31",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "expand_31",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "clone_31",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_390",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_370",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_168",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_389",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_169",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_170",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_168",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_432",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_169",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_433",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_170",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_434",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_432",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_433",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_434",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_15",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_135",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_136",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_15",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_141",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_15",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_142",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_135",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "alias_default_435",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_435",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_171",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_171",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_391",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_140",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "dtype_cast_140",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_140",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "permute_172",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_391",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_436",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_172",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "alias_default_437",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_436",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_437",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "einsum_default_108",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_423",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_108",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15",
+      "name": "add_76",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_145",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "dtype_cast_141",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_76",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15",
+      "name": "alias_default_438",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_438",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_374",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_374",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_440",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_440",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "pow_32",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mean_31",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "add_77",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "rsqrt_31",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_441",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_440",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_441",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_109",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_141",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_439",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_439",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_110",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_110",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_375",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_141",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "dtype_cast_142",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_142",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "permute_173",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_375",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_442",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_173",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "alias_default_443",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_442",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_443",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "einsum_default_109",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "alias_default_444",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_444",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "convert_element_type_378",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_378",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_445",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_445",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "neg_15",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "exp_15",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "add_78",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_445",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_78",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "div_15",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "convert_element_type_379",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_143",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "dtype_cast_143",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_143",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "permute_174",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_174",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "alias_default_447",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_442",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_447",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "einsum_default_110",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_379",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_446",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "alias_default_448",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_446",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_448",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_111",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_142",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "dtype_cast_144",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_144",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "permute_175",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_449",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_175",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "alias_default_450",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_449",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_450",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "einsum_default_111",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_438",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_111",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15",
+      "name": "add_79",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_153",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "dtype_cast_145",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_79",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15",
+      "name": "alias_default_451",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_451",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_384",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_384",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_453",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_453",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "pow_33",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mean_32",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "add_80",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_80",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "rsqrt_32",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_454",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_453",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_454",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_112",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_145",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_452",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_112",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_452",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_113",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_113",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_385",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_146",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "dtype_cast_146",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_146",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "permute_176",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_385",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_455",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_176",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "alias_default_456",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_455",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_456",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "einsum_default_112",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_147",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "dtype_cast_147",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_147",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "permute_177",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_177",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "alias_default_457",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_455",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_457",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "einsum_default_113",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_148",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "dtype_cast_148",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_148",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "permute_178",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_178",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "alias_default_458",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_455",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_458",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "einsum_default_114",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_406",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_407",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_408",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_406",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_392",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_409",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_409",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_complex_32",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_407",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_393",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_410",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_410",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_complex_33",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_411",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_411",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_459",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_459",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "mul_114",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_real_32",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_412",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_459",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "mul_115",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_115",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_real_33",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_413",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_412",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_394",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_413",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_395",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_395",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "unsqueeze_32",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "expand_32",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "clone_32",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_414",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_408",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "unsqueeze_33",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "expand_33",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "clone_33",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_415",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_394",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_179",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_414",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_180",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_415",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_181",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_179",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_460",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_461",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_462",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_460",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_461",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_462",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_16",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_144",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_145",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_16",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_150",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_16",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_151",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_144",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "alias_default_463",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_463",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_182",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_182",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_416",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_149",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "dtype_cast_149",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_149",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "permute_183",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_416",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_464",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_183",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "alias_default_465",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_464",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_465",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "einsum_default_115",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_451",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16",
+      "name": "add_81",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_154",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "dtype_cast_150",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16",
+      "name": "alias_default_466",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_466",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_398",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_398",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_468",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_468",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "pow_34",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mean_33",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "add_82",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_82",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "rsqrt_33",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_469",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_468",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_469",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_116",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_150",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_467",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_116",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_467",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_117",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_399",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_150",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "dtype_cast_151",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_151",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "permute_184",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_399",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_470",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_184",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "alias_default_471",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_470",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_471",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "einsum_default_116",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_116",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "alias_default_472",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_472",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "convert_element_type_402",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_402",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_473",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_473",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "neg_16",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "exp_16",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "add_83",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_473",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "div_16",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "convert_element_type_403",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_152",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "dtype_cast_152",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_152",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "permute_185",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_185",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "alias_default_475",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_470",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_475",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "einsum_default_117",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_403",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_474",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_117",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "alias_default_476",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_474",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_476",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_118",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_151",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "dtype_cast_153",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_153",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "permute_186",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_118",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_477",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_186",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "alias_default_478",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_477",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_478",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "einsum_default_118",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_466",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_118",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16",
+      "name": "add_84",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_162",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "dtype_cast_154",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_84",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16",
+      "name": "alias_default_479",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_479",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_408",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_408",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_481",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_481",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "pow_35",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mean_34",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "add_85",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_85",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "rsqrt_34",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_482",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_481",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_482",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_119",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_154",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_480",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_119",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_480",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_120",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_120",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_409",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_155",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "dtype_cast_155",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_155",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "permute_187",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_409",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_483",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_187",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "alias_default_484",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_483",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_484",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "einsum_default_119",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_156",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "dtype_cast_156",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_156",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "permute_188",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_188",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "alias_default_485",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_483",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_485",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "einsum_default_120",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_157",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "dtype_cast_157",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_157",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "permute_189",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_189",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "alias_default_486",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_483",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_486",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "einsum_default_121",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_119",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_431",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_432",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_433",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_431",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_416",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_434",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_434",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_complex_34",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_432",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_417",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_417",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_435",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_435",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_complex_35",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_436",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_436",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_487",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_487",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "mul_121",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_real_34",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_437",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_487",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "mul_122",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_122",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_real_35",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_438",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_437",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_418",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_438",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_419",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_419",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "unsqueeze_34",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "expand_34",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "clone_34",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_439",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_433",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "unsqueeze_35",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "expand_35",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "clone_35",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_440",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_418",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_190",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_439",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_191",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_440",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_192",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_190",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_488",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_191",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_489",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_192",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_490",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_488",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_489",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_490",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_17",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_153",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_154",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_17",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_159",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_17",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_160",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_153",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "alias_default_491",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_491",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_193",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_193",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_441",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_158",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "dtype_cast_158",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_158",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "permute_194",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_441",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_492",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_194",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "alias_default_493",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_492",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_493",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "einsum_default_122",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_479",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_122",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17",
+      "name": "add_86",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_163",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "dtype_cast_159",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_86",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17",
+      "name": "alias_default_494",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_494",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_422",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_422",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_496",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_496",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "pow_36",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mean_35",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "add_87",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "rsqrt_35",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_497",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_496",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_497",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_123",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_159",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_495",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_123",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_495",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_124",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_423",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_159",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "dtype_cast_160",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_160",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "permute_195",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_423",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_498",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_195",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "alias_default_499",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_498",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_499",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "einsum_default_123",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_123",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "alias_default_500",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_500",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "convert_element_type_426",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_426",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_501",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_501",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "neg_17",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "exp_17",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "add_88",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_501",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_88",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "div_17",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "convert_element_type_427",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_161",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "dtype_cast_161",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_161",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "permute_196",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_196",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "alias_default_503",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_498",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_503",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "einsum_default_124",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_427",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_502",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_124",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "alias_default_504",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_502",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_504",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_125",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_160",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "dtype_cast_162",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_162",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "permute_197",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_125",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_505",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_197",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "alias_default_506",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_505",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_506",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "einsum_default_125",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_494",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_125",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17",
+      "name": "add_89",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_171",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "dtype_cast_163",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_89",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17",
+      "name": "alias_default_507",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_507",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_432",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_432",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_509",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_509",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "pow_37",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mean_36",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "add_90",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "rsqrt_36",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_510",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_509",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_510",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_126",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_163",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_508",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_508",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_127",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_433",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_164",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "dtype_cast_164",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_164",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "permute_198",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_433",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_511",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_198",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "alias_default_512",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_511",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_512",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "einsum_default_126",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_165",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "dtype_cast_165",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_165",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "permute_199",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_199",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "alias_default_513",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_511",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_513",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "einsum_default_127",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_166",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "dtype_cast_166",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_166",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "permute_200",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_200",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "alias_default_514",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_511",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_514",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "einsum_default_128",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_456",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_457",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_458",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_456",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_440",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_440",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_459",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_459",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_complex_36",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_457",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_441",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_441",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_460",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_460",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_complex_37",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_461",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_461",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_515",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_515",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "mul_128",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_real_36",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_462",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_515",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "mul_129",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_129",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_real_37",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_463",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_462",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_442",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_463",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_443",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_443",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "unsqueeze_36",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "expand_36",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "clone_36",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_464",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_458",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "unsqueeze_37",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "expand_37",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "clone_37",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_465",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_442",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_201",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_464",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_202",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_465",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_203",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_201",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_516",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_517",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_203",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_518",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_516",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_517",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_518",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_18",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_162",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_163",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_18",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_168",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_18",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_169",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_162",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "alias_default_519",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_519",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_204",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_204",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_466",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_167",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "dtype_cast_167",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_167",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "permute_205",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_466",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_520",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_205",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "alias_default_521",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_520",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_521",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "einsum_default_129",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_507",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_129",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18",
+      "name": "add_91",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_172",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "dtype_cast_168",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18",
+      "name": "alias_default_522",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_522",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_446",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_446",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_524",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_524",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "pow_38",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mean_37",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "add_92",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_92",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "rsqrt_37",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_525",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_524",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_525",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_130",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_168",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_523",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_523",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_131",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_131",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_447",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_168",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "dtype_cast_169",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_169",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "permute_206",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_447",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_526",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_206",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "alias_default_527",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_526",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_527",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "einsum_default_130",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_130",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "alias_default_528",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_528",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "convert_element_type_450",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_450",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_529",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_529",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "neg_18",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "exp_18",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "add_93",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_529",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_93",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "div_18",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "convert_element_type_451",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_170",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "dtype_cast_170",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_170",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "permute_207",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_207",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "alias_default_531",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_526",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_531",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "einsum_default_131",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_451",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_530",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_131",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "alias_default_532",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_530",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_532",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_132",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_169",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "dtype_cast_171",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "permute_208",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_132",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_533",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_208",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "alias_default_534",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_533",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_534",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "einsum_default_132",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_522",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_132",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18",
+      "name": "add_94",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_180",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "dtype_cast_172",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_94",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18",
+      "name": "alias_default_535",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_535",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_456",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_456",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_537",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_537",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "pow_39",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mean_38",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "add_95",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_95",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "rsqrt_38",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_538",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_537",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_538",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_133",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_172",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_536",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_133",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_536",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_134",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_134",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_457",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_173",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "dtype_cast_173",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_173",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "permute_209",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_457",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_539",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_209",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "alias_default_540",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_539",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_540",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "einsum_default_133",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_174",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "dtype_cast_174",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_174",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "permute_210",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_210",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "alias_default_541",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_539",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_541",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "einsum_default_134",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_175",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "dtype_cast_175",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_175",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "permute_211",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_211",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "alias_default_542",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_539",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_542",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "einsum_default_135",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_133",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_481",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_134",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_482",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_135",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_483",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_481",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_464",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_464",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_484",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_484",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_complex_38",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_482",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_465",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_465",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_485",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_485",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_complex_39",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_486",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_486",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_543",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_543",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "mul_135",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_135",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_real_38",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_487",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_543",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "mul_136",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_136",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_real_39",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_488",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_487",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_466",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_488",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_467",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_467",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "unsqueeze_38",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "expand_38",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "clone_38",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_489",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_483",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "unsqueeze_39",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "expand_39",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "clone_39",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_490",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_466",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_212",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_489",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_213",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_490",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_214",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_212",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_544",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_213",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_545",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_546",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_544",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_545",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_546",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_19",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_171",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_172",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_19",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_177",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_19",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_178",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "alias_default_547",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_547",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_215",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_215",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_491",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_176",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "dtype_cast_176",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_176",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "permute_216",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_491",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_548",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_216",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "alias_default_549",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_548",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_549",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "einsum_default_136",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_535",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_136",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19",
+      "name": "add_96",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_181",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "dtype_cast_177",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19",
+      "name": "alias_default_550",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_550",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_470",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_470",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_552",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_552",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "pow_40",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mean_39",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "add_97",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_97",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "rsqrt_39",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_553",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_552",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_553",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_137",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_177",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_551",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_137",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_551",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_138",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_138",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_471",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_177",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "dtype_cast_178",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_178",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "permute_217",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_471",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_554",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_217",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "alias_default_555",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_554",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_555",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "einsum_default_137",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "alias_default_556",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_556",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "convert_element_type_474",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_474",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_557",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_557",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "neg_19",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "exp_19",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "add_98",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_557",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_98",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "div_19",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "convert_element_type_475",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_179",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "dtype_cast_179",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_179",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "permute_218",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_218",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "alias_default_559",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_554",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_559",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "einsum_default_138",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_475",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_558",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "alias_default_560",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_558",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_560",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_139",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_178",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "dtype_cast_180",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "permute_219",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_139",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_561",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_219",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "alias_default_562",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_561",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_562",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "einsum_default_139",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_550",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_139",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19",
+      "name": "add_99",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_189",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "dtype_cast_181",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19",
+      "name": "alias_default_563",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_563",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_480",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_480",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_565",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_565",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "pow_41",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mean_40",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "add_100",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_100",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "rsqrt_40",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_566",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_565",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_566",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_140",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_181",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_564",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_140",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_564",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_141",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_481",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_182",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "dtype_cast_182",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_182",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "permute_220",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_481",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_567",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_220",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "alias_default_568",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_567",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_568",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "einsum_default_140",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_183",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "dtype_cast_183",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_183",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "permute_221",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_221",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "alias_default_569",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_567",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_569",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "einsum_default_141",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_184",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "dtype_cast_184",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_184",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "permute_222",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_222",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "alias_default_570",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_567",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_570",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "einsum_default_142",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_140",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_506",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_141",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_507",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_142",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_508",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_506",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_488",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_488",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_509",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_509",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_complex_40",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_507",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_489",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_489",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_510",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_510",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_complex_41",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_511",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_511",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_571",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_571",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "mul_142",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_142",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_real_40",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_512",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_571",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "mul_143",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_143",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_real_41",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_513",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_512",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_490",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_513",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_491",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_491",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "unsqueeze_40",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "expand_40",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "clone_40",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_514",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_508",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "unsqueeze_41",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "expand_41",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "clone_41",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_515",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_490",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_223",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_514",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_224",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_515",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_225",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_223",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_572",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_224",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_573",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_225",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_574",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_572",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_573",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_574",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_20",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_180",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_181",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_20",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_186",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_20",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_187",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "alias_default_575",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_575",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_226",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_226",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_516",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_185",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "dtype_cast_185",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_185",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "permute_227",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_516",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_576",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_227",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "alias_default_577",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_576",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_577",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "einsum_default_143",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_563",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_143",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20",
+      "name": "add_101",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_190",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "dtype_cast_186",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_101",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20",
+      "name": "alias_default_578",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_578",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_494",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_494",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_580",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_580",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "pow_42",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mean_41",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "add_102",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "rsqrt_41",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_581",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_580",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_581",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_144",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_186",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_579",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_144",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_579",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_145",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_495",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_186",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "dtype_cast_187",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_187",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "permute_228",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_495",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_582",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_228",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "alias_default_583",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_582",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_583",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "einsum_default_144",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_144",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "alias_default_584",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_584",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "convert_element_type_498",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_498",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_585",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_585",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "neg_20",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "exp_20",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "add_103",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_585",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "div_20",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "convert_element_type_499",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_188",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "dtype_cast_188",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_188",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "permute_229",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_229",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "alias_default_587",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_582",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_587",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "einsum_default_145",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_499",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_586",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_145",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "alias_default_588",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_586",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_588",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_146",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_187",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "dtype_cast_189",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "permute_230",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_146",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_589",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_230",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "alias_default_590",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_589",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_590",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "einsum_default_146",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_578",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_146",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20",
+      "name": "add_104",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_198",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "dtype_cast_190",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_104",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20",
+      "name": "alias_default_591",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_591",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_504",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_504",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_593",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_593",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "pow_43",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mean_42",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "add_105",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_105",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "rsqrt_42",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_594",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_593",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_594",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_147",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_190",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_592",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_592",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_148",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_148",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_505",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_191",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "dtype_cast_191",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_191",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "permute_231",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_505",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_595",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_231",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "alias_default_596",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_595",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_596",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "einsum_default_147",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_192",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "dtype_cast_192",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_192",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "permute_232",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_232",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "alias_default_597",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_595",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_597",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "einsum_default_148",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_193",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "dtype_cast_193",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_193",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "permute_233",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_233",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "alias_default_598",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_595",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_598",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "einsum_default_149",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_147",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_531",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_148",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_532",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_533",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_531",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_512",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_512",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_534",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_534",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_complex_42",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_532",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_513",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_513",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_535",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_535",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_complex_43",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_536",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_536",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_599",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_599",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "mul_149",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_real_42",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_537",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_599",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "mul_150",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_150",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_real_43",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_538",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_537",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_514",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_538",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_515",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_515",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "unsqueeze_42",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "expand_42",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "clone_42",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_539",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_533",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "unsqueeze_43",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "expand_43",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "clone_43",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_540",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_514",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_234",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_539",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_235",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_540",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_236",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_600",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_235",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_601",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_602",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_600",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_601",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_602",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_21",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_189",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_190",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_21",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_195",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_21",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_196",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "alias_default_603",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_603",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_237",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_237",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_541",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_194",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "dtype_cast_194",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_194",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "permute_238",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_541",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_604",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_238",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "alias_default_605",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_604",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_605",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "einsum_default_150",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_591",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_150",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21",
+      "name": "add_106",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_199",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "dtype_cast_195",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_106",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21",
+      "name": "alias_default_606",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_606",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_518",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_518",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_608",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_608",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "pow_44",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mean_43",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "add_107",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_107",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "rsqrt_43",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_609",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_608",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_609",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_151",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_195",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_607",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_151",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_607",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_152",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_152",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_519",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_195",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "dtype_cast_196",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_196",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "permute_239",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_519",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_610",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_239",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "alias_default_611",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_610",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_611",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "einsum_default_151",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_151",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "alias_default_612",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_612",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "convert_element_type_522",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_522",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_613",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_613",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "neg_21",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "exp_21",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "add_108",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_613",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "div_21",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "convert_element_type_523",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_197",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "dtype_cast_197",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_197",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "permute_240",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_240",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "alias_default_615",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_610",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_615",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "einsum_default_152",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_523",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_614",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_152",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "alias_default_616",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_614",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_616",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_153",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_196",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "dtype_cast_198",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_198",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "permute_241",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_153",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_617",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_241",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "alias_default_618",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_617",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_618",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "einsum_default_153",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_606",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_153",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21",
+      "name": "add_109",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_207",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "dtype_cast_199",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21",
+      "name": "alias_default_619",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_619",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_528",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_528",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_621",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_621",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "pow_45",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mean_44",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "add_110",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_110",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "rsqrt_44",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_622",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_621",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_622",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_154",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_199",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_620",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_620",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_155",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_155",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_529",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_200",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "dtype_cast_200",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_200",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "permute_242",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_529",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_623",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_242",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "alias_default_624",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_623",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_624",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "einsum_default_154",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_201",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "dtype_cast_201",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_201",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "permute_243",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_243",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "alias_default_625",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_623",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_625",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "einsum_default_155",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_202",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "dtype_cast_202",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_202",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "permute_244",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_244",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "alias_default_626",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_623",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_626",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "einsum_default_156",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_154",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_556",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_155",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_557",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_156",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_558",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_556",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_536",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_536",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_559",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_559",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_complex_44",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_557",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_537",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_537",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_560",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_560",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_complex_45",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_561",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_561",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_627",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_627",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "mul_156",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_156",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_real_44",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_562",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_627",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "mul_157",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_157",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_real_45",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_563",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_562",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_538",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_563",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_539",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_539",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "unsqueeze_44",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "expand_44",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "clone_44",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_564",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_558",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "unsqueeze_45",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "expand_45",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "clone_45",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_565",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_538",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_245",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_564",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_246",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_565",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_247",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_245",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_628",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_246",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_629",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_247",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_630",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_628",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_629",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_630",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_22",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_198",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_199",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_22",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_204",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_22",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_205",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_198",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "alias_default_631",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_631",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_248",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_566",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_203",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "dtype_cast_203",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_203",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "permute_249",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_566",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_632",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_249",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "alias_default_633",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_632",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_633",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "einsum_default_157",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_619",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22",
+      "name": "add_111",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_208",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "dtype_cast_204",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_111",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22",
+      "name": "alias_default_634",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_634",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_542",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_542",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_636",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_636",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "pow_46",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mean_45",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "add_112",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_112",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "rsqrt_45",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_637",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_636",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_637",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_158",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_204",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_635",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_158",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_635",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_159",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_159",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_543",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_204",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "dtype_cast_205",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_205",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "permute_250",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_543",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_638",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_250",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "alias_default_639",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_638",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_639",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "einsum_default_158",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_158",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "alias_default_640",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_640",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "convert_element_type_546",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_546",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_641",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_641",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "neg_22",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "exp_22",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "add_113",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_641",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "div_22",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "convert_element_type_547",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_206",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "dtype_cast_206",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_206",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "permute_251",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_251",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "alias_default_643",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_638",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_643",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "einsum_default_159",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_547",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_642",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_159",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "alias_default_644",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_642",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_644",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_160",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_205",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "dtype_cast_207",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_207",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "permute_252",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_160",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_645",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_252",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "alias_default_646",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_645",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_646",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "einsum_default_160",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_634",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_160",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22",
+      "name": "add_114",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_216",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "dtype_cast_208",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_114",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22",
+      "name": "alias_default_647",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_647",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_552",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_552",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_649",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_649",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "pow_47",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mean_46",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "add_115",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "rsqrt_46",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_650",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_649",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_650",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_161",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_208",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_648",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_161",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_648",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_162",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_162",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_553",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_209",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "dtype_cast_209",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_209",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "permute_253",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_553",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_651",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_253",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "alias_default_652",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_651",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_652",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "einsum_default_161",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_210",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "dtype_cast_210",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_210",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "permute_254",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_254",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "alias_default_653",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_651",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_653",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "einsum_default_162",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_211",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "dtype_cast_211",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_211",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "permute_255",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_255",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "alias_default_654",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_651",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_654",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "einsum_default_163",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_161",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_581",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_162",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_582",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_163",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_583",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_581",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_560",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_560",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_584",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_584",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_complex_46",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_582",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_561",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_561",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_585",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_585",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_complex_47",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_586",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_586",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_655",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_655",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "mul_163",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_163",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_real_46",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_587",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_655",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "mul_164",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_164",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_real_47",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_588",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_587",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_562",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_588",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_563",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_563",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "unsqueeze_46",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "expand_46",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "clone_46",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_589",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_583",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "unsqueeze_47",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "expand_47",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "clone_47",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_590",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_562",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_256",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_589",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_257",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_590",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_258",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_256",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_656",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_257",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_657",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_658",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_656",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_657",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_658",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_23",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_207",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_208",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_23",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_213",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_23",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_214",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_207",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "alias_default_659",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_659",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_259",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_259",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_591",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_212",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "dtype_cast_212",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_212",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "permute_260",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_591",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_660",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_260",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "alias_default_661",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_660",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_661",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "einsum_default_164",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_647",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_164",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23",
+      "name": "add_116",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_217",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "dtype_cast_213",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_116",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23",
+      "name": "alias_default_662",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_662",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_566",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_566",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_664",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_664",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "pow_48",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mean_47",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "add_117",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "rsqrt_47",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_665",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_664",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_665",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_165",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_213",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_663",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_165",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_663",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_166",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_166",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_567",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_213",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "dtype_cast_214",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_214",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "permute_261",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_567",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_666",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_261",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "alias_default_667",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_666",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_667",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "einsum_default_165",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "alias_default_668",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_668",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "convert_element_type_570",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_570",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_669",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_669",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "neg_23",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "exp_23",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "add_118",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_669",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_118",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "div_23",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "convert_element_type_571",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_215",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "dtype_cast_215",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_215",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "permute_262",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_262",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "alias_default_671",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_666",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_671",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "einsum_default_166",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_571",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_670",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_166",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "alias_default_672",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_670",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_672",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_167",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "dtype_cast_216",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "permute_263",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_167",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_673",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_263",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "alias_default_674",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_673",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_674",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "einsum_default_167",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_662",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_167",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23",
+      "name": "add_119",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_225",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "dtype_cast_217",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_119",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23",
+      "name": "alias_default_675",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_675",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_576",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_576",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_677",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_677",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "pow_49",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mean_48",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "add_120",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_120",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "rsqrt_48",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_678",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_677",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_678",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_168",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_217",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_676",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_168",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_676",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_169",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_169",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_577",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_218",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "dtype_cast_218",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_218",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "permute_264",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_577",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_679",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_264",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "alias_default_680",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_679",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_680",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "einsum_default_168",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_219",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "dtype_cast_219",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_219",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "permute_265",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_265",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "alias_default_681",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_679",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_681",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "einsum_default_169",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_220",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "dtype_cast_220",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_220",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "permute_266",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_266",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "alias_default_682",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_679",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_682",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "einsum_default_170",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_606",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_169",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_607",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_170",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_608",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_606",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_584",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_584",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_609",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_609",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_complex_48",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_607",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_585",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_585",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_610",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_610",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_complex_49",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_611",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_611",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_683",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_683",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "mul_170",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_170",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_real_48",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_612",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_683",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "mul_171",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_171",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_real_49",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_613",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_612",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_586",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_613",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_587",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_587",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "unsqueeze_48",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "expand_48",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "clone_48",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_614",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_608",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "unsqueeze_49",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "expand_49",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "clone_49",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_615",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_586",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_267",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_614",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_268",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_615",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_269",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_267",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_684",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_685",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_686",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_684",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_685",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_686",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_24",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_216",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_217",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_24",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_222",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_24",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_223",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "alias_default_687",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_687",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_270",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_270",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_616",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_221",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "dtype_cast_221",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_221",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "permute_271",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_616",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_688",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_271",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "alias_default_689",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_688",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_689",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "einsum_default_171",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_675",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24",
+      "name": "add_121",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_226",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "dtype_cast_222",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_121",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24",
+      "name": "alias_default_690",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_690",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_590",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_590",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_692",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_692",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "pow_50",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mean_49",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "add_122",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_122",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "rsqrt_49",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_693",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_692",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_693",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_172",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_222",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_691",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_691",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_173",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_591",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_222",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "dtype_cast_223",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_223",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "permute_272",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_591",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_694",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_272",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "alias_default_695",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_694",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_695",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "einsum_default_172",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_172",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "alias_default_696",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_696",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "convert_element_type_594",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_594",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_697",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_697",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "neg_24",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "exp_24",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "add_123",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_697",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_123",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "div_24",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "convert_element_type_595",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_224",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "dtype_cast_224",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_224",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "permute_273",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_273",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "alias_default_699",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_694",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_699",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "einsum_default_173",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_595",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_698",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_173",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "alias_default_700",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_698",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_700",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_174",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_223",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "dtype_cast_225",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_225",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "permute_274",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_174",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_701",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_274",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "alias_default_702",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_701",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_702",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "einsum_default_174",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_690",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_174",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24",
+      "name": "add_124",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_234",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "dtype_cast_226",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24",
+      "name": "alias_default_703",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_703",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_600",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_600",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_705",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_705",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "pow_51",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mean_50",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "add_125",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "rsqrt_50",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_706",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_705",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_706",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_175",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_226",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_704",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_175",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_704",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_176",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_176",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_601",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_227",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "dtype_cast_227",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_227",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "permute_275",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_601",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_707",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_275",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "alias_default_708",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_707",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_708",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "einsum_default_175",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_228",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "dtype_cast_228",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_228",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "permute_276",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_276",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "alias_default_709",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_707",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_709",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "einsum_default_176",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_229",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "dtype_cast_229",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_229",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "permute_277",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_277",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "alias_default_710",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_707",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_710",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "einsum_default_177",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_175",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_631",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_632",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_177",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_633",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_631",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_608",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_608",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_634",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_634",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_complex_50",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_632",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_609",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_609",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_635",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_635",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_complex_51",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_636",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_636",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_711",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_711",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "mul_177",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_177",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_real_50",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_637",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_711",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "mul_178",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_178",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_real_51",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_638",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_637",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_610",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_638",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_611",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_611",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "unsqueeze_50",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "expand_50",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "clone_50",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_639",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_633",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "unsqueeze_51",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "expand_51",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "clone_51",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_640",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_610",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_278",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_639",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_279",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_640",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_280",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_278",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_712",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_279",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_713",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_280",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_714",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_712",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_713",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_714",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_25",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_225",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_226",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_25",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_231",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_25",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_232",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_225",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "alias_default_715",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_715",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_281",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_641",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_230",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "dtype_cast_230",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_230",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "permute_282",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_641",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_716",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_282",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "alias_default_717",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_716",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_717",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "einsum_default_178",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_703",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_178",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25",
+      "name": "add_126",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_235",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "dtype_cast_231",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25",
+      "name": "alias_default_718",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_718",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_614",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_614",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_720",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_720",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "pow_52",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mean_51",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "add_127",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "rsqrt_51",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_721",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_720",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_721",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_179",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_231",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_719",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_179",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_719",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_180",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_615",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_231",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "dtype_cast_232",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_232",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "permute_283",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_615",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_722",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_283",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "alias_default_723",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_722",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_723",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "einsum_default_179",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_179",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "alias_default_724",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_724",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "convert_element_type_618",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_618",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_725",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_725",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "neg_25",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "exp_25",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "add_128",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_725",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "div_25",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "convert_element_type_619",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_233",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "dtype_cast_233",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_233",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "permute_284",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_284",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "alias_default_727",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_722",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_727",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "einsum_default_180",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_619",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_726",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_180",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "alias_default_728",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_726",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_728",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_181",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_232",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "dtype_cast_234",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "permute_285",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_181",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_729",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_285",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "alias_default_730",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_729",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_730",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "einsum_default_181",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_718",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_181",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25",
+      "name": "add_129",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_243",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "dtype_cast_235",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_129",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25",
+      "name": "alias_default_731",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_731",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_624",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_624",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_733",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_733",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "pow_53",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_53",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mean_52",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "add_130",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "rsqrt_52",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_734",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_733",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_734",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_182",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_235",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_732",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_182",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_732",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_183",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_183",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_625",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_236",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "dtype_cast_236",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_236",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "permute_286",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_625",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_735",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_286",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "alias_default_736",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_735",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_736",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "einsum_default_182",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_237",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "dtype_cast_237",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_237",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "permute_287",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_287",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "alias_default_737",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_735",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_737",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "einsum_default_183",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_238",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "dtype_cast_238",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_238",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "permute_288",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_288",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "alias_default_738",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_735",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_738",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "einsum_default_184",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_182",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_656",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_183",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_657",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_184",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_658",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_656",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_632",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_632",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_659",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_659",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_complex_52",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_657",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_633",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_633",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_660",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_660",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_complex_53",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_661",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_661",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_739",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_739",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "mul_184",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_184",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_real_52",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_662",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_739",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "mul_185",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_185",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_real_53",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_663",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_662",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_634",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_663",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_635",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_635",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "unsqueeze_52",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "expand_52",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "clone_52",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_664",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_658",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "unsqueeze_53",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "expand_53",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "clone_53",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_665",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_634",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_289",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_664",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_290",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_665",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_291",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_289",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_740",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_741",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_742",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_740",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_741",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_742",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_26",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_234",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_235",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_26",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_240",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_26",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_241",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "alias_default_743",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_743",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_292",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_292",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_666",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_239",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "dtype_cast_239",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_239",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "permute_293",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_666",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_744",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_293",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "alias_default_745",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_744",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_745",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "einsum_default_185",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_731",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_185",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26",
+      "name": "add_131",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_244",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "dtype_cast_240",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_131",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26",
+      "name": "alias_default_746",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_746",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_638",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_638",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_748",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_748",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "pow_54",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mean_53",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_53",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "add_132",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_132",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "rsqrt_53",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_53",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_749",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_748",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_749",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_186",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_240",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_747",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_747",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_187",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_187",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_639",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_240",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "dtype_cast_241",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_241",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "permute_294",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_639",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_750",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_294",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "alias_default_751",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_750",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_751",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "einsum_default_186",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_186",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "alias_default_752",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_752",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "convert_element_type_642",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_642",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_753",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_753",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "neg_26",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "exp_26",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "add_133",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_753",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_133",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "div_26",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "convert_element_type_643",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_242",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "dtype_cast_242",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_242",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "permute_295",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_295",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "alias_default_755",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_750",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_755",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "einsum_default_187",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_643",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_754",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_187",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "alias_default_756",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_754",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_756",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_188",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "dtype_cast_243",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "permute_296",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_188",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_757",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_296",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "alias_default_758",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_757",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_758",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "einsum_default_188",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_746",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_188",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26",
+      "name": "add_134",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_252",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "dtype_cast_244",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_134",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26",
+      "name": "alias_default_759",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_759",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_648",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_648",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_761",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_761",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "pow_55",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mean_54",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "add_135",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_135",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "rsqrt_54",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_762",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_761",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_762",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_189",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_244",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_760",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_760",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_190",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_190",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_649",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_245",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "dtype_cast_245",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_245",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "permute_297",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_649",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_763",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_297",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "alias_default_764",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_763",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_764",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "einsum_default_189",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_246",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "dtype_cast_246",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_246",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "permute_298",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_298",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "alias_default_765",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_763",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_765",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "einsum_default_190",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_247",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "dtype_cast_247",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_247",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "permute_299",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_299",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "alias_default_766",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_763",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_766",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "einsum_default_191",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_189",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_681",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_190",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_682",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_191",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_683",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_681",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_656",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_656",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_684",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_684",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_complex_54",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_682",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_657",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_657",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_685",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_685",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_complex_55",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_686",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_686",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_767",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_767",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "mul_191",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_191",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_real_54",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_687",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_767",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "mul_192",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_192",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_real_55",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_688",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_687",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_658",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_688",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_659",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_659",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "unsqueeze_54",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "expand_54",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "clone_54",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_689",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_683",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "unsqueeze_55",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "expand_55",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "clone_55",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_690",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_658",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_300",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_689",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_301",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_690",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_302",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_768",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_769",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_302",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_770",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_768",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_769",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_770",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_27",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_243",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_244",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_27",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_249",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_27",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_250",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "alias_default_771",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_771",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_303",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_303",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_691",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_248",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "dtype_cast_248",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_248",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "permute_304",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_691",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_772",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_304",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "alias_default_773",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_772",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_773",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "einsum_default_192",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_759",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_192",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27",
+      "name": "add_136",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_253",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "dtype_cast_249",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_136",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27",
+      "name": "alias_default_774",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_774",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_662",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_662",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_776",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_776",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "pow_56",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mean_55",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "add_137",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_137",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "rsqrt_55",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_777",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_776",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_777",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_193",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_249",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_775",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_193",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_775",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_194",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_194",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_663",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_249",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "dtype_cast_250",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_250",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "permute_305",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_663",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_778",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_305",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "alias_default_779",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_778",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_779",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "einsum_default_193",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_193",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "alias_default_780",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_780",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "convert_element_type_666",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_666",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_781",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_781",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "neg_27",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "exp_27",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "add_138",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_781",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "div_27",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "convert_element_type_667",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_251",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "dtype_cast_251",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_251",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "permute_306",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_306",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "alias_default_783",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_778",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_783",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "einsum_default_194",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_667",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_782",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_194",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "alias_default_784",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_782",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_784",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_195",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_250",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "dtype_cast_252",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_252",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "permute_307",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_195",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_785",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_307",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "alias_default_786",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_785",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_786",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "einsum_default_195",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_774",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_195",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27",
+      "name": "add_139",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_261",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "dtype_cast_253",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_139",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27",
+      "name": "alias_default_787",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_787",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_672",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_672",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_789",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_789",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "pow_57",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mean_56",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "add_140",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_140",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "rsqrt_56",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_790",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_789",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_790",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_196",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_253",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_788",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_196",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_788",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_197",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_197",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_673",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_254",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "dtype_cast_254",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_254",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "permute_308",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_673",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_791",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_308",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "alias_default_792",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_791",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_792",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "einsum_default_196",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_255",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "dtype_cast_255",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_255",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "permute_309",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_309",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "alias_default_793",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_791",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_793",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "einsum_default_197",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_256",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "dtype_cast_256",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_256",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "permute_310",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_310",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "alias_default_794",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_791",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_794",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "einsum_default_198",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_196",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_706",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_707",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_198",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_708",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_706",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_680",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_680",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_709",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_709",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_complex_56",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_707",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_681",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_681",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_710",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_710",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_complex_57",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_711",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_711",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_795",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_795",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "mul_198",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_198",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_real_56",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_712",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_795",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "mul_199",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_199",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_real_57",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_713",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_712",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_682",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_713",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_683",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_683",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "unsqueeze_56",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "expand_56",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "clone_56",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_714",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_708",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "unsqueeze_57",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "expand_57",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "clone_57",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_715",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_682",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_311",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_714",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_312",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_715",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_313",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_796",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_312",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_797",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_313",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_798",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_796",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_797",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_798",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_28",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_252",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_253",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_28",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_258",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_28",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_259",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_252",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "alias_default_799",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_799",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_314",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_314",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_716",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_257",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "dtype_cast_257",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_257",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "permute_315",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_716",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_800",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_315",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "alias_default_801",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_800",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_801",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "einsum_default_199",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_787",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_199",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28",
+      "name": "add_141",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_262",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "dtype_cast_258",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28",
+      "name": "alias_default_802",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_802",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_686",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_686",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_804",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_804",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "pow_58",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_58",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mean_57",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "add_142",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_142",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "rsqrt_57",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_805",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_804",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_805",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_200",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_258",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_803",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_200",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_803",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_201",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_201",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_687",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_258",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "dtype_cast_259",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_259",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "permute_316",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_687",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_806",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_316",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "alias_default_807",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_806",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_807",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "einsum_default_200",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_200",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "alias_default_808",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_808",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "convert_element_type_690",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_690",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_809",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_809",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "neg_28",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "exp_28",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "add_143",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_809",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_143",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "div_28",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "convert_element_type_691",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_260",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "dtype_cast_260",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_260",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "permute_317",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_317",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "alias_default_811",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_806",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_811",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "einsum_default_201",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_691",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_810",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_201",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "alias_default_812",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_810",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_812",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_202",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_259",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "dtype_cast_261",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_261",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "permute_318",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_202",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_813",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_318",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "alias_default_814",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_813",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_814",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "einsum_default_202",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_802",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_202",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28",
+      "name": "add_144",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_270",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "dtype_cast_262",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_144",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28",
+      "name": "alias_default_815",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_815",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_696",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_696",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_817",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_817",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "pow_59",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mean_58",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_58",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "add_145",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "rsqrt_58",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_58",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_818",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_817",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_818",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_203",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_262",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_816",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_203",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_816",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_204",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_204",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_697",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_263",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "dtype_cast_263",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_263",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "permute_319",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_697",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_819",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_319",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "alias_default_820",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_819",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_820",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "einsum_default_203",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_264",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "dtype_cast_264",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_264",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "permute_320",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_320",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "alias_default_821",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_819",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_821",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "einsum_default_204",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_265",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "dtype_cast_265",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_265",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "permute_321",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_321",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "alias_default_822",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_819",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_822",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "einsum_default_205",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_203",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_731",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_204",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_732",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_205",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_733",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_731",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_704",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_704",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_734",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_734",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_complex_58",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_732",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_705",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_705",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_735",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_735",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_complex_59",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_736",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_736",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_823",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_823",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "mul_205",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_205",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_real_58",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_737",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_823",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "mul_206",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_206",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_real_59",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_738",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_737",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_706",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_738",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_707",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_707",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "unsqueeze_58",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "expand_58",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "clone_58",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_739",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_733",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "unsqueeze_59",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "expand_59",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "clone_59",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_740",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_706",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_322",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_739",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_323",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_740",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_324",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_824",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_825",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_324",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_826",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_824",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_825",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_826",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_29",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_261",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_262",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_29",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_267",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_29",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_268",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_261",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "alias_default_827",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_827",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_325",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_325",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_741",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_266",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "dtype_cast_266",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_266",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "permute_326",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_741",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_828",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_326",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "alias_default_829",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_828",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_829",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "einsum_default_206",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_815",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_206",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29",
+      "name": "add_146",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_271",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "dtype_cast_267",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29",
+      "name": "alias_default_830",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_830",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_710",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_710",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_832",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_832",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "pow_60",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mean_59",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "add_147",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "rsqrt_59",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_833",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_832",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_833",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_207",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_267",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_831",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_207",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_831",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_208",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_208",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_711",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_267",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "dtype_cast_268",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_268",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "permute_327",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_711",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_834",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_327",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "alias_default_835",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_834",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_835",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "einsum_default_207",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_207",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "alias_default_836",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_836",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "convert_element_type_714",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_714",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_837",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_837",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "neg_29",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "exp_29",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "add_148",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_837",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_148",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "div_29",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "convert_element_type_715",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_269",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "dtype_cast_269",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_269",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "permute_328",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_328",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "alias_default_839",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_834",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_839",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "einsum_default_208",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_715",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_838",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_208",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "alias_default_840",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_838",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_840",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_209",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "dtype_cast_270",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "permute_329",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_209",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_841",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_329",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "alias_default_842",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_841",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_842",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "einsum_default_209",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_830",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_209",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29",
+      "name": "add_149",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_279",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "dtype_cast_271",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_149",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29",
+      "name": "alias_default_843",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_843",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_720",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_720",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_845",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_845",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "pow_61",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mean_60",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "add_150",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_150",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "rsqrt_60",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_846",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_845",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_846",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_210",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_271",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_844",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_210",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_844",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_211",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_211",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_721",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_272",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "dtype_cast_272",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_272",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "permute_330",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_721",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_847",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_330",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "alias_default_848",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_847",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_848",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "einsum_default_210",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_273",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "dtype_cast_273",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_273",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "permute_331",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_331",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "alias_default_849",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_847",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_849",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "einsum_default_211",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_274",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "dtype_cast_274",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_274",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "permute_332",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_332",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "alias_default_850",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_847",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_850",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "einsum_default_212",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_210",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_756",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_211",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_757",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_212",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_758",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_756",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_728",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_728",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_759",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_759",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_complex_60",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_757",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_729",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_729",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_760",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_760",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_complex_61",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_761",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_761",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_851",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_851",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "mul_212",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_212",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_real_60",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_762",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_851",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "mul_213",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_213",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_real_61",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_763",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_762",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_730",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_763",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_731",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_731",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "unsqueeze_60",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "expand_60",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "clone_60",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_764",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_758",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "unsqueeze_61",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "expand_61",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "clone_61",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_765",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_730",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_333",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_764",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_334",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_765",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_335",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_852",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_334",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_853",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_335",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_854",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_852",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_853",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_854",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_30",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_270",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_271",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_30",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_276",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_30",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_277",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "alias_default_855",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_855",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_336",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_766",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_275",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "dtype_cast_275",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_275",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "permute_337",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_766",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_856",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_337",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "alias_default_857",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_856",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_857",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "einsum_default_213",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_843",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_213",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30",
+      "name": "add_151",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_280",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "dtype_cast_276",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_151",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30",
+      "name": "alias_default_858",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_858",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_734",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_734",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_860",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_860",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "pow_62",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mean_61",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "add_152",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_152",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "rsqrt_61",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_861",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_860",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_861",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_214",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_276",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_859",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_859",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_215",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_215",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_735",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_276",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "dtype_cast_277",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_277",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "permute_338",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_735",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_862",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_338",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "alias_default_863",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_862",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_863",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "einsum_default_214",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_214",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "alias_default_864",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_864",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "convert_element_type_738",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_738",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_865",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_865",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "neg_30",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "exp_30",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "add_153",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_865",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_153",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "div_30",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "convert_element_type_739",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_278",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "dtype_cast_278",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_278",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "permute_339",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_339",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "alias_default_867",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_862",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_867",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "einsum_default_215",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_739",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_866",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_215",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "alias_default_868",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_866",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_868",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_216",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_277",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "dtype_cast_279",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_279",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "permute_340",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_216",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_869",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_340",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "alias_default_870",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_869",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_870",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "einsum_default_216",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_858",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_216",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30",
+      "name": "add_154",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 0,
+      "cluster_root": "dtype_cast_1",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_288",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "dtype_cast_280",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30",
+      "name": "alias_default_871",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 1,
+      "cluster_root": "convert_element_type",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_871",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_744",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 2,
+      "cluster_root": "alias_default_5",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_744",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_873",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 3,
+      "cluster_root": "pow_1",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_873",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "pow_63",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 4,
+      "cluster_root": "mean",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mean_62",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 5,
+      "cluster_root": "add",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "add_155",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 6,
+      "cluster_root": "rsqrt",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_155",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "rsqrt_62",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 7,
+      "cluster_root": "alias_default_6",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_874",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 8,
+      "cluster_root": "mul",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_873",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_874",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_217",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 9,
+      "cluster_root": "alias_default_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_280",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_872",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 10,
+      "cluster_root": "mul_1",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_872",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_218",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 11,
+      "cluster_root": "convert_element_type_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_218",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_745",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 12,
+      "cluster_root": "dtype_cast_2",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_281",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "dtype_cast_281",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 13,
+      "cluster_root": "permute",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 93.01059422750424,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_281",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "permute_341",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 14,
+      "cluster_root": "alias_default_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_745",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_875",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 15,
+      "cluster_root": "alias_default_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_341",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "alias_default_876",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 16,
+      "cluster_root": "einsum_default",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_875",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_876",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "einsum_default_217",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 17,
+      "cluster_root": "dtype_cast_3",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_282",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "dtype_cast_282",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 18,
+      "cluster_root": "permute_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 75.93123841862722,
+          "dst_placement": "RR",
+          "name": "dtype_cast_282",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "permute_342",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 19,
+      "cluster_root": "alias_default_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_342",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "alias_default_877",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 20,
+      "cluster_root": "einsum_default_1",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_875",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_877",
+          "src_placement": "RR",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "einsum_default_218",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 21,
+      "cluster_root": "dtype_cast_4",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_283",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "dtype_cast_283",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 22,
+      "cluster_root": "permute_2",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 39.60264855687606,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_283",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "permute_343",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 23,
+      "cluster_root": "alias_default_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_343",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "alias_default_878",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 24,
+      "cluster_root": "einsum_default_2",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_875",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_878",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "einsum_default_219",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 25,
+      "cluster_root": "view_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_217",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_781",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 26,
+      "cluster_root": "view_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_218",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_782",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 27,
+      "cluster_root": "view_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_219",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_783",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 28,
+      "cluster_root": "convert_element_type_8",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_781",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_752",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 29,
+      "cluster_root": "view_9",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_752",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_784",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 30,
+      "cluster_root": "view_as_complex",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_784",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_complex_62",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 31,
+      "cluster_root": "convert_element_type_9",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_782",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_753",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 32,
+      "cluster_root": "view_10",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_753",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_785",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 33,
+      "cluster_root": "view_as_complex_1",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_785",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_complex_63",
+      "op": "aten.view_as_complex.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 34,
+      "cluster_root": "view_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_786",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 35,
+      "cluster_root": "alias_default_11",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "view_786",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_879",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "return freqs_cis.view(*shape)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "reshape_for_broadcast",
+        "line": 183
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 36,
+      "cluster_root": "mul_2",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_879",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "mul_219",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 37,
+      "cluster_root": "view_as_real",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_219",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_real_62",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 38,
+      "cluster_root": "view_12",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_787",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 39,
+      "cluster_root": "mul_3",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_879",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "mul_220",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 40,
+      "cluster_root": "view_as_real_1",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_220",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_real_63",
+      "op": "aten.view_as_real.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 41,
+      "cluster_root": "view_13",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_788",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 42,
+      "cluster_root": "convert_element_type_10",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_787",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_754",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 43,
+      "cluster_root": "convert_element_type_11",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_788",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_755",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 44,
+      "cluster_root": "unsqueeze",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_755",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "unsqueeze_62",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 45,
+      "cluster_root": "expand",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "expand_62",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 46,
+      "cluster_root": "clone",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "clone_62",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 47,
+      "cluster_root": "view_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_789",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 48,
+      "cluster_root": "unsqueeze_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_783",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "unsqueeze_63",
+      "op": "aten.unsqueeze.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 49,
+      "cluster_root": "expand_1",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "unsqueeze_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "expand_63",
+      "op": "aten.expand.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 50,
+      "cluster_root": "clone_1",
+      "compute_cost": 26.027785181236673,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "expand_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "clone_63",
+      "op": "aten.clone.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 51,
+      "cluster_root": "view_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "clone_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_790",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 52,
+      "cluster_root": "permute_3",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_754",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_344",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 53,
+      "cluster_root": "permute_4",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_789",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_345",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 54,
+      "cluster_root": "permute_5",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_790",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_346",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 55,
+      "cluster_root": "alias_default_12",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_344",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_880",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 56,
+      "cluster_root": "alias_default_13",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_345",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_881",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 57,
+      "cluster_root": "alias_default_14",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_882",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 58,
+      "cluster_root": "_scaled_dot_product_flash_attention",
+      "compute_cost": 794.1005545110502,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_880",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_881",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_882",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_31",
+      "op": "aten._scaled_dot_product_flash_attention.default",
+      "phase": "forward",
+      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 59,
+      "cluster_root": "getitem",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_279",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_280",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_31",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_285",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        2
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "uint64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_scaled_dot_product_flash_attention_31",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_286",
+      "op": "<built-in function getitem>",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 60,
+      "cluster_root": "alias_default_15",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_279",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "alias_default_883",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 61,
+      "cluster_root": "permute_6",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_883",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_347",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 62,
+      "cluster_root": "view_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_347",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_791",
+      "op": "aten.view.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 63,
+      "cluster_root": "dtype_cast_5",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_284",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "dtype_cast_284",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 64,
+      "cluster_root": "permute_7",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 187.32495367450883,
+          "dst_placement": "RR",
+          "name": "dtype_cast_284",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "permute_348",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 65,
+      "cluster_root": "alias_default_16",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "view_791",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_884",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 66,
+      "cluster_root": "alias_default_17",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_348",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "alias_default_885",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 67,
+      "cluster_root": "einsum_default_3",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_884",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_885",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "einsum_default_220",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 68,
+      "cluster_root": "add_1",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_871",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_220",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31",
+      "name": "add_156",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 69,
+      "cluster_root": "dtype_cast_6",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_289",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "dtype_cast_285",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 70,
+      "cluster_root": "alias_default_18",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31",
+      "name": "alias_default_886",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 419
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 71,
+      "cluster_root": "convert_element_type_14",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_886",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_758",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 72,
+      "cluster_root": "alias_default_20",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_758",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_888",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 73,
+      "cluster_root": "pow_2",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_888",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "pow_64",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 74,
+      "cluster_root": "mean_1",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mean_63",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 75,
+      "cluster_root": "add_2",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "add_157",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 76,
+      "cluster_root": "rsqrt_1",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "rsqrt_63",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 77,
+      "cluster_root": "alias_default_21",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_889",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 78,
+      "cluster_root": "mul_4",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_888",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_889",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_221",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 79,
+      "cluster_root": "alias_default_19",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_285",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_887",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 80,
+      "cluster_root": "mul_5",
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_221",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_887",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_222",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 81,
+      "cluster_root": "convert_element_type_15",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_222",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_759",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 82,
+      "cluster_root": "dtype_cast_7",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_285",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "dtype_cast_286",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 83,
+      "cluster_root": "permute_8",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_286",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "permute_349",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 84,
+      "cluster_root": "alias_default_22",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_759",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_890",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 85,
+      "cluster_root": "alias_default_23",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_349",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "alias_default_891",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 86,
+      "cluster_root": "einsum_default_4",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_890",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_891",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "einsum_default_221",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 87,
+      "cluster_root": "alias_default_24",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_221",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "alias_default_892",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 88,
+      "cluster_root": "convert_element_type_18",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_892",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "convert_element_type_762",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 89,
+      "cluster_root": "alias_default_25",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_762",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_893",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 90,
+      "cluster_root": "neg",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_893",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "neg_31",
+      "op": "aten.neg.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 91,
+      "cluster_root": "exp",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "exp_31",
+      "op": "aten.exp.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 92,
+      "cluster_root": "add_3",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "add_158",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 93,
+      "cluster_root": "div",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_893",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_158",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "div_31",
+      "op": "aten.div.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 94,
+      "cluster_root": "convert_element_type_19",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "div_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "convert_element_type_763",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 95,
+      "cluster_root": "dtype_cast_8",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_287",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "dtype_cast_287",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 96,
+      "cluster_root": "permute_9",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_287",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "permute_350",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 97,
+      "cluster_root": "alias_default_27",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_350",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "alias_default_895",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 98,
+      "cluster_root": "einsum_default_5",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_890",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_895",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "einsum_default_222",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 99,
+      "cluster_root": "alias_default_26",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_763",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_894",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 100,
+      "cluster_root": "alias_default_28",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_222",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "alias_default_896",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 101,
+      "cluster_root": "mul_6",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_894",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_896",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_223",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 102,
+      "cluster_root": "dtype_cast_9",
+      "compute_cost": 8.540367012593283,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "primals_286",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "dtype_cast_288",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 103,
+      "cluster_root": "permute_10",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 258.576,
+          "dst_placement": "RS(1)",
+          "name": "dtype_cast_288",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "permute_351",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 104,
+      "cluster_root": "alias_default_29",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_223",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_897",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 105,
+      "cluster_root": "alias_default_30",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_351",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "alias_default_898",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 106,
+      "cluster_root": "einsum_default_6",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_897",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_898",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "einsum_default_223",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 107,
+      "cluster_root": "add_4",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_886",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_223",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31",
+      "name": "add_159",
+      "op": "aten.add.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_290",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "dtype_cast_289",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 108,
+      "cluster_root": "alias_default_31",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_159",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31",
+      "name": "alias_default_899",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 420
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_899",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_768",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_768",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_901",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_901",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "pow_65",
+      "op": "aten.pow.Tensor_Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "pow_65",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mean_64",
+      "op": "aten.mean.dim",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mean_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "add_160",
+      "op": "aten.add.Scalar",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_160",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "rsqrt_64",
+      "op": "aten.rsqrt.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "rsqrt_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_902",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_901",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_902",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_224",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 28.358260191421483,
+          "dst_placement": "RR",
+          "name": "dtype_cast_289",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_900",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 52.058747582344104,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_224",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_900",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_225",
+      "op": "aten.mul.Tensor",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_225",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_769",
+      "op": "prims.convert_element_type.default",
+      "phase": "forward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 76.40578345195063,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(0)",
+          "name": "primals_291",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "dtype_cast_290",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "forward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 2081.296,
+          "dst_placement": "RS(0)",
+          "name": "dtype_cast_290",
+          "src_placement": "S(0)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "permute_352",
+      "op": "aten.permute.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        128256
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_769",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_903",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_352",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "alias_default_904",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        128256
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 6216.318403281814,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_903",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_904",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "einsum_default_224",
+      "op": "aten.einsum.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        128256
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "alias_default_1420",
+      "op": "aten.alias.default",
+      "phase": "forward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        128256
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "tangents_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "alias_default_2",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        128256
+      ],
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 6216.318403281814,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_903",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "einsum_default_225",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        128256
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_904",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "permute_355",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 6216.318403281814,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_355",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "einsum_default_226",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_225",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "permute_356",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "output = self.output(h) if self.output else h",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 545
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 305.6231338078025,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_356",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "dtype_cast_291",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 4133.392,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_291",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].output",
+      "name": "alias_default_1711",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_226",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_776",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_899",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_777",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_900",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_778",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_776",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_905",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_905",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_778",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_226",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_777",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_902",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_227",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_226",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_906",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_907",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_907",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_906",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_228",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_228",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "sum_1",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_907",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "div_32",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_229",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_906",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_229",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "sub",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_902",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_230",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_905",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_907",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "mul_231",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_231",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "sum_2",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_230",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_779",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_2",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "convert_element_type_780",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_780",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "dtype_cast_292",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_292",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_1710",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "convert_element_type_779",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].norm",
+      "name": "alias_default_908",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_908",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_897",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "einsum_default_227",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_898",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "permute_359",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_908",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_359",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "einsum_default_228",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_227",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "permute_360",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_360",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "dtype_cast_293",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_293",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "alias_default_1706",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_228",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w2",
+      "name": "alias_default_909",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_909",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_894",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_232",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_909",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_896",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_233",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_232",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_910",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_910",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_890",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "einsum_default_229",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_895",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "permute_363",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_910",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_363",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "einsum_default_230",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_229",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "permute_364",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_364",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "dtype_cast_294",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_294",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w3",
+      "name": "alias_default_1707",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_233",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "convert_element_type_789",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_892",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "convert_element_type_790",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_790",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_911",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_911",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "neg_32",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "exp_32",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "add_161",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_161",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "reciprocal",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_234",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_234",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_912",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_789",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_912",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_235",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_912",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "sub_1",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_911",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_236",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_236",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "add_162",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_235",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_162",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "mul_237",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_237",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "convert_element_type_791",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_791",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward",
+      "name": "alias_default_913",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_913",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_890",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "einsum_default_231",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_891",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "permute_367",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_913",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_367",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "einsum_default_232",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_230",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_232",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31",
+      "name": "add_163",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_231",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "permute_368",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_368",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "dtype_cast_295",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_295",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.feed_forward.w1",
+      "name": "alias_default_1705",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_163",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_796",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_886",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_797",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_887",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_798",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_796",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_914",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_914",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_798",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_238",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_797",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_889",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_239",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_238",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_915",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_239",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_916",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_916",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_915",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_240",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_240",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "sum_3",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_916",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "div_33",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_241",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_915",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "sub_2",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_889",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_242",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_914",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_916",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "mul_243",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "sum_4",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_799",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_4",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "convert_element_type_800",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_908",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_799",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "name": "add_164",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_800",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "dtype_cast_296",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_296",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.ffn_norm",
+      "name": "alias_default_1709",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_164",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "alias_default_917",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_917",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_884",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "einsum_default_233",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_885",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "permute_371",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_917",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_371",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "einsum_default_234",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_233",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "permute_372",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_372",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "dtype_cast_297",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_297",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wo",
+      "name": "alias_default_1704",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_812",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_812",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_373",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_373",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_880",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_881",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_882",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_883",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_280",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_285",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_286",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_288",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_289",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.sdpa",
+      "name": "getitem_290",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_374",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_289",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_375",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_288",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "permute_376",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_374",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_813",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_813",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "sum_5",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "squeeze",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_375",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_814",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_814",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "sum_6",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "squeeze_1",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_805",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_376",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_806",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_805",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_815",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_815",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_complex_64",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_879",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "_conj",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "clone_70",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_64",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_70",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "mul_244",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_806",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_816",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_816",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_complex_65",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_879",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "_conj_1",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_1",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "clone_71",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_65",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_71",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "mul_245",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_244",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_real_64",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_64",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_817",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_817",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_807",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_245",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_as_real_65",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_65",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_818",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_818",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "convert_element_type_808",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_819",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_807",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_820",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_808",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "view_821",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_819",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_918",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_918",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_875",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "einsum_default_235",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_878",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "permute_379",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_918",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_379",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "einsum_default_236",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_235",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "permute_380",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_380",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "dtype_cast_298",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_298",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wv",
+      "name": "alias_default_1703",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_820",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_919",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_919",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_875",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "einsum_default_237",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_877",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "permute_383",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_919",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_383",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "einsum_default_238",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_238",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "add_165",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_237",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "permute_384",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_384",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "dtype_cast_299",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_299",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wk",
+      "name": "alias_default_1702",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_821",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention",
+      "name": "alias_default_920",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_920",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_875",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "einsum_default_239",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_876",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "permute_387",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_920",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_387",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "einsum_default_240",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_165",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_240",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31",
+      "name": "add_166",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_239",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "permute_388",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_388",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "dtype_cast_300",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_300",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention.wq",
+      "name": "alias_default_1701",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_166",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_821",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_871",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_822",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_872",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_823",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_821",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_921",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_921",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_823",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_246",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_822",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_874",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_247",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_246",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_922",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_247",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_923",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_923",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_922",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_248",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_248",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "sum_7",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_923",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "div_34",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_249",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_922",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_249",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "sub_3",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_874",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_250",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_921",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_923",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "mul_251",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_251",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "sum_8",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_250",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_824",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_8",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "convert_element_type_825",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_917",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_824",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "add_167",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_825",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "dtype_cast_301",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_301",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.31.attention_norm",
+      "name": "alias_default_1708",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_167",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "alias_default_924",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_924",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_869",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "einsum_default_241",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_870",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "permute_391",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_924",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_391",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "einsum_default_242",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_241",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "permute_392",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_392",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "dtype_cast_302",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_302",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "alias_default_1697",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_242",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w2",
+      "name": "alias_default_925",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_925",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_866",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_252",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_925",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_868",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_253",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_252",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_926",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_926",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_862",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "einsum_default_243",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_867",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "permute_395",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_926",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_395",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "einsum_default_244",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_243",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "permute_396",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_396",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "dtype_cast_303",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_303",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w3",
+      "name": "alias_default_1698",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_253",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "convert_element_type_834",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_864",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "convert_element_type_835",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_835",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_927",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_927",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "neg_33",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "exp_33",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "add_168",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "reciprocal_1",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_1",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_254",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_254",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_928",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_834",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_928",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_255",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_928",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "sub_4",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_927",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_256",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_256",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "add_169",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_255",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_169",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "mul_257",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_257",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "convert_element_type_836",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_836",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward",
+      "name": "alias_default_929",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_929",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_862",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "einsum_default_245",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_863",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "permute_399",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_929",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_399",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "einsum_default_246",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_244",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_246",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30",
+      "name": "add_170",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_245",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "permute_400",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_400",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "dtype_cast_304",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_304",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.feed_forward.w1",
+      "name": "alias_default_1696",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_170",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_841",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_858",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_842",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_859",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_843",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_841",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_930",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_930",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_843",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_258",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_842",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_861",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_259",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_931",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_259",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_932",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_932",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_931",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_260",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_260",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "sum_9",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_932",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "div_35",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_261",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_931",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_261",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "sub_5",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_861",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_262",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_930",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_932",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "mul_263",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_263",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "sum_10",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_262",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_844",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_10",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "convert_element_type_845",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_924",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_844",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "add_171",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_845",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "dtype_cast_305",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_305",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.ffn_norm",
+      "name": "alias_default_1700",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "alias_default_933",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_933",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_856",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "einsum_default_247",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_857",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "permute_403",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_933",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_403",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "einsum_default_248",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_247",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "permute_404",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_404",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "dtype_cast_306",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_306",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wo",
+      "name": "alias_default_1695",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_248",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_836",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_836",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_405",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_405",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_852",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_853",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_854",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_855",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_271",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_276",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_277",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_1",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_291",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_292",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.sdpa",
+      "name": "getitem_293",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_293",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_406",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_292",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_407",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "permute_408",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_406",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_837",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_837",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "sum_11",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "squeeze_2",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_407",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_838",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_838",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "sum_12",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "squeeze_3",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_850",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_408",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_851",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_850",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_839",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_839",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_complex_66",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_851",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "_conj_2",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_2",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "clone_78",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_66",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_78",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "mul_264",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_851",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_840",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_840",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_complex_67",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_851",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "_conj_3",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_3",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "clone_79",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_67",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_79",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "mul_265",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_264",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_real_66",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_66",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_841",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_841",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_852",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_265",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_as_real_67",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_67",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_842",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_842",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "convert_element_type_853",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_843",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_852",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_844",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_853",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "view_845",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_843",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_934",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_934",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_847",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "einsum_default_249",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_850",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "permute_411",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_934",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_411",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "einsum_default_250",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_249",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "permute_412",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_412",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "dtype_cast_307",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_307",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wv",
+      "name": "alias_default_1694",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_844",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_935",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_935",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_847",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "einsum_default_251",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_849",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "permute_415",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_935",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_415",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "einsum_default_252",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_250",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_252",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "add_172",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_251",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "permute_416",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_416",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "dtype_cast_308",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_308",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wk",
+      "name": "alias_default_1693",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_845",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention",
+      "name": "alias_default_936",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_936",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_847",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "einsum_default_253",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_848",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "permute_419",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_936",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_419",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "einsum_default_254",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_254",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30",
+      "name": "add_173",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_253",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "permute_420",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_420",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "dtype_cast_309",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_309",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention.wq",
+      "name": "alias_default_1692",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_866",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_843",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_867",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_844",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_868",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_866",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_937",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_937",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_868",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_266",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_867",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_846",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_267",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_938",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_267",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_939",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_939",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_938",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_268",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "sum_13",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_939",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "div_36",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_269",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_938",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "sub_6",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_846",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_270",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_937",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_939",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "mul_271",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_271",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "sum_14",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_869",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_14",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "convert_element_type_870",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_933",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_869",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "add_174",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_870",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "dtype_cast_310",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_310",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.30.attention_norm",
+      "name": "alias_default_1699",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_174",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "alias_default_940",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_940",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_841",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "einsum_default_255",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_842",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "permute_423",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_940",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_423",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "einsum_default_256",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_255",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "permute_424",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_424",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "dtype_cast_311",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_311",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "alias_default_1688",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_256",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w2",
+      "name": "alias_default_941",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_941",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_838",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_272",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_941",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_840",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_273",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_272",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_942",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_942",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_834",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "einsum_default_257",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_839",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "permute_427",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_942",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_427",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "einsum_default_258",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_257",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "permute_428",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_428",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "dtype_cast_312",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_312",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w3",
+      "name": "alias_default_1689",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_273",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "convert_element_type_879",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_836",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "convert_element_type_880",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_880",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_943",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_943",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "neg_34",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "exp_34",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "add_175",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_175",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "reciprocal_2",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_2",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_274",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_274",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_944",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_879",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_944",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_275",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_944",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "sub_7",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_943",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_276",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_276",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "add_176",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_275",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "mul_277",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_277",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "convert_element_type_881",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_881",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward",
+      "name": "alias_default_945",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_945",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_834",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "einsum_default_259",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_835",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "permute_431",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_945",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_431",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "einsum_default_260",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_258",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_260",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29",
+      "name": "add_177",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_259",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "permute_432",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_432",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "dtype_cast_313",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_313",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.feed_forward.w1",
+      "name": "alias_default_1687",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_177",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_886",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_830",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_887",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_831",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_888",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_886",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_946",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_946",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_888",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_278",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_887",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_833",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_279",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_278",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_947",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_279",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_948",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_948",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_947",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_280",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_280",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "sum_15",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_948",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "div_37",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_281",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_947",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_281",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "sub_8",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_833",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_282",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_946",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_948",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "mul_283",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "sum_16",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_282",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_889",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_16",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "convert_element_type_890",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_940",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_889",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "add_178",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_890",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "dtype_cast_314",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_314",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.ffn_norm",
+      "name": "alias_default_1691",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_178",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "alias_default_949",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_949",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_828",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "einsum_default_261",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_829",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "permute_435",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_949",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_435",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "einsum_default_262",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_261",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "permute_436",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_436",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "dtype_cast_315",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_315",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wo",
+      "name": "alias_default_1686",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_262",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_860",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_860",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_437",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_437",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_824",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_825",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_826",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_827",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_262",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_267",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_268",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_2",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_294",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_295",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_2",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.sdpa",
+      "name": "getitem_296",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_296",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_438",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_295",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_439",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_294",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "permute_440",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_438",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_861",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_861",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "sum_17",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "squeeze_4",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_439",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_862",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_862",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "sum_18",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "squeeze_5",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_895",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_440",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_896",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_895",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_863",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_863",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_complex_68",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_823",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "_conj_4",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_4",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "clone_86",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_68",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_86",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "mul_284",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_896",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_864",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_864",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_complex_69",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_823",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "_conj_5",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_5",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "clone_87",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_69",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_87",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "mul_285",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_284",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_real_68",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_68",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_865",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_865",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_897",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_285",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_as_real_69",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_69",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_866",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_866",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "convert_element_type_898",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_867",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_897",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_868",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_898",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "view_869",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_867",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_950",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_950",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_819",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "einsum_default_263",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_822",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "permute_443",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_950",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_443",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "einsum_default_264",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_263",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "permute_444",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_444",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "dtype_cast_316",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_316",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wv",
+      "name": "alias_default_1685",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_868",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_951",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_951",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_819",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "einsum_default_265",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_821",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "permute_447",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_951",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_447",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "einsum_default_266",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_264",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "add_179",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_265",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "permute_448",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_448",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "dtype_cast_317",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_317",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wk",
+      "name": "alias_default_1684",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_869",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention",
+      "name": "alias_default_952",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_952",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_819",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "einsum_default_267",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_820",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "permute_451",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_952",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_451",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "einsum_default_268",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_179",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29",
+      "name": "add_180",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_267",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "permute_452",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_452",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "dtype_cast_318",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_318",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention.wq",
+      "name": "alias_default_1683",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_911",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_815",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_912",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_816",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_913",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_911",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_953",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_953",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_913",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_286",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_912",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_818",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_287",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_286",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_954",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_287",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_955",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_955",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_954",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_288",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_288",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "sum_19",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_955",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "div_38",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_289",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_954",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_289",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "sub_9",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_818",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_290",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_953",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_955",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "mul_291",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "sum_20",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_914",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_20",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "convert_element_type_915",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_949",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_914",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "add_181",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_915",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "dtype_cast_319",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_319",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.29.attention_norm",
+      "name": "alias_default_1690",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "alias_default_956",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_956",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_813",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "einsum_default_269",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_814",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "permute_455",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_956",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_455",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "einsum_default_270",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_269",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "permute_456",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_456",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "dtype_cast_320",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_320",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "alias_default_1679",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_270",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w2",
+      "name": "alias_default_957",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_957",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_810",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_292",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_957",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_812",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_293",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_292",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_958",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_958",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_806",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "einsum_default_271",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_811",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "permute_459",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_958",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_459",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "einsum_default_272",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_271",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "permute_460",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_460",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "dtype_cast_321",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_321",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w3",
+      "name": "alias_default_1680",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_293",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "convert_element_type_924",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_808",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "convert_element_type_925",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_925",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_959",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_959",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "neg_35",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "exp_35",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "add_182",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_182",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "reciprocal_3",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_3",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_294",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_294",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_960",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_924",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_960",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_295",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_960",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "sub_10",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_959",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_296",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_296",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "add_183",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_295",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_183",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "mul_297",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_297",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "convert_element_type_926",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_926",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward",
+      "name": "alias_default_961",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_961",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_806",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "einsum_default_273",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_807",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "permute_463",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_961",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_463",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "einsum_default_274",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_272",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_274",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28",
+      "name": "add_184",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_273",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "permute_464",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_464",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "dtype_cast_322",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_322",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.feed_forward.w1",
+      "name": "alias_default_1678",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_184",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_931",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_802",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_932",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_803",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_933",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_931",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_962",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_962",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_933",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_298",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_932",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_805",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_299",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_963",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_299",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_964",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_964",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_963",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_300",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "sum_21",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_964",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "div_39",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_301",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_963",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "sub_11",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_805",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_302",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_962",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_964",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "mul_303",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_303",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "sum_22",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_302",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_934",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_22",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "convert_element_type_935",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_956",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_934",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "add_185",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_935",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "dtype_cast_323",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_323",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.ffn_norm",
+      "name": "alias_default_1682",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_185",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "alias_default_965",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_965",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_800",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "einsum_default_275",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_801",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "permute_467",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_965",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_467",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "einsum_default_276",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_275",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "permute_468",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_468",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "dtype_cast_324",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_324",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wo",
+      "name": "alias_default_1677",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_276",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_884",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_884",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_469",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_469",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_796",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_797",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_798",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_799",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_253",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_258",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_259",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_3",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_297",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_298",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.sdpa",
+      "name": "getitem_299",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_299",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_470",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_471",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_297",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "permute_472",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_470",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_885",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_885",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "sum_23",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "squeeze_6",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_471",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_886",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_886",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "sum_24",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "squeeze_7",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_940",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_472",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_941",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_940",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_887",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_887",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_complex_70",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_795",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "_conj_6",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_6",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "clone_94",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_70",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_94",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "mul_304",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_941",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_888",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_888",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_complex_71",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_795",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "_conj_7",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_7",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "clone_95",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_71",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_95",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "mul_305",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_304",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_real_70",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_70",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_889",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_889",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_942",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_305",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_as_real_71",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_71",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_890",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_890",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "convert_element_type_943",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_891",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_942",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_892",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_943",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "view_893",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_891",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_966",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_966",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_791",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "einsum_default_277",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_794",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "permute_475",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_966",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_475",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "einsum_default_278",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_277",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "permute_476",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_476",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "dtype_cast_325",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_325",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wv",
+      "name": "alias_default_1676",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_892",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_967",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_967",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_791",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "einsum_default_279",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_793",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "permute_479",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_967",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_479",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "einsum_default_280",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_278",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_280",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "add_186",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_279",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "permute_480",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_480",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "dtype_cast_326",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_326",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wk",
+      "name": "alias_default_1675",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_893",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention",
+      "name": "alias_default_968",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_968",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_791",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "einsum_default_281",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_792",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "permute_483",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_968",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_483",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "einsum_default_282",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_282",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28",
+      "name": "add_187",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_281",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "permute_484",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_484",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "dtype_cast_327",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_327",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention.wq",
+      "name": "alias_default_1674",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_187",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_956",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_787",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_957",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_788",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_958",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_956",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_969",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_969",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_958",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_306",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_957",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_790",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_307",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_306",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_970",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_307",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_971",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_971",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_970",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_308",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_308",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "sum_25",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_971",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "div_40",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_309",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_970",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_309",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "sub_12",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_790",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_310",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_969",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_971",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "mul_311",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "sum_26",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_310",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_959",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_26",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "convert_element_type_960",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_965",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_959",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "add_188",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_960",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "dtype_cast_328",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_328",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.28.attention_norm",
+      "name": "alias_default_1681",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_188",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "alias_default_972",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_972",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_785",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "einsum_default_283",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_786",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "permute_487",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_972",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_487",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "einsum_default_284",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_283",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "permute_488",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_488",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "dtype_cast_329",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_329",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "alias_default_1670",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_284",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w2",
+      "name": "alias_default_973",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_973",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_782",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_312",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_973",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_784",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_313",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_312",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_974",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_974",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_778",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "einsum_default_285",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_783",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "permute_491",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_974",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_491",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "einsum_default_286",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_285",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "permute_492",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_492",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "dtype_cast_330",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_330",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w3",
+      "name": "alias_default_1671",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_313",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "convert_element_type_969",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_780",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "convert_element_type_970",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_970",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_975",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_975",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "neg_36",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "exp_36",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "add_189",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_189",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "reciprocal_4",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_4",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_314",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_314",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_976",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_969",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_976",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_315",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_976",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "sub_13",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_975",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_316",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_316",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "add_190",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_315",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_190",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "mul_317",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_317",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "convert_element_type_971",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_971",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward",
+      "name": "alias_default_977",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_977",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_778",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "einsum_default_287",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_779",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "permute_495",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_977",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_495",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "einsum_default_288",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_286",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_288",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27",
+      "name": "add_191",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_287",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "permute_496",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_496",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "dtype_cast_331",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_331",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.feed_forward.w1",
+      "name": "alias_default_1669",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_191",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_976",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_774",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_977",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_775",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_978",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_976",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_978",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_978",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_978",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_318",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_977",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_777",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_319",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_318",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_979",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_319",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_980",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_980",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_979",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_320",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_320",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "sum_27",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_980",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "div_41",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_321",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_979",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "sub_14",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_777",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_322",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_978",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_980",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "mul_323",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "sum_28",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_979",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_28",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "convert_element_type_980",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_972",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_979",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "add_192",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_980",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "dtype_cast_332",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_332",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.ffn_norm",
+      "name": "alias_default_1673",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_192",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "alias_default_981",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_981",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_772",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "einsum_default_289",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_773",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "permute_499",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_981",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_499",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "einsum_default_290",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_289",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "permute_500",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_500",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "dtype_cast_333",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_333",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wo",
+      "name": "alias_default_1668",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_908",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_908",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_501",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_501",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_768",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_769",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_770",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_771",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_244",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_249",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_250",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_4",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_300",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_301",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_4",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.sdpa",
+      "name": "getitem_302",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_302",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_502",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_503",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "permute_504",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_502",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_909",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_909",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "sum_29",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "squeeze_8",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_503",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_910",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_910",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "sum_30",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "squeeze_9",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_985",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_504",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_986",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_985",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_911",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_911",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_complex_72",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_767",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "_conj_8",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_8",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "clone_102",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_72",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_102",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "mul_324",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_986",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_912",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_912",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_complex_73",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_767",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "_conj_9",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_9",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "clone_103",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_73",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_103",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "mul_325",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_324",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_real_72",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_72",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_913",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_913",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_987",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_325",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_as_real_73",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_73",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_914",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_914",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "convert_element_type_988",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_915",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_987",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_916",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_988",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "view_917",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_915",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_982",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_982",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_763",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "einsum_default_291",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_766",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "permute_507",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_982",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_507",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "einsum_default_292",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_291",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "permute_508",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_508",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "dtype_cast_334",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_334",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wv",
+      "name": "alias_default_1667",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_916",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_983",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_983",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_763",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "einsum_default_293",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_765",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "permute_511",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_983",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_511",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "einsum_default_294",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_292",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_294",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "add_193",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_293",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "permute_512",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_512",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "dtype_cast_335",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_335",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wk",
+      "name": "alias_default_1666",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_917",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention",
+      "name": "alias_default_984",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_984",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_763",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "einsum_default_295",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_764",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "permute_515",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_984",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_515",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "einsum_default_296",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_193",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_296",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27",
+      "name": "add_194",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_295",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "permute_516",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_516",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "dtype_cast_336",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_336",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention.wq",
+      "name": "alias_default_1665",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_194",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_1001",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_759",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_1002",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_760",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_1003",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1001",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_985",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_985",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1003",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_326",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1002",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_762",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_327",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_986",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_327",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_987",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_987",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_986",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_328",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_328",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "sum_31",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_987",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "div_42",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_329",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_986",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_329",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "sub_15",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_762",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_330",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_985",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_987",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "mul_331",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_331",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "sum_32",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_330",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_1004",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_32",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "convert_element_type_1005",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_981",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1004",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "add_195",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1005",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "dtype_cast_337",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_337",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.27.attention_norm",
+      "name": "alias_default_1672",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_195",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "alias_default_988",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_988",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_757",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "einsum_default_297",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_758",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "permute_519",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_988",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_519",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "einsum_default_298",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_297",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "permute_520",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_520",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "dtype_cast_338",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_338",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "alias_default_1661",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_298",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w2",
+      "name": "alias_default_989",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_989",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_754",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_332",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_989",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_756",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_333",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_332",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_990",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_990",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_750",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "einsum_default_299",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_755",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "permute_523",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_990",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_523",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "einsum_default_300",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_299",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "permute_524",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_524",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "dtype_cast_339",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_339",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w3",
+      "name": "alias_default_1662",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_333",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "convert_element_type_1014",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_752",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "convert_element_type_1015",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1015",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_991",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_991",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "neg_37",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "exp_37",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "add_196",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_196",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "reciprocal_5",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_5",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_334",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_992",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1014",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_992",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_335",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_992",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "sub_16",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_991",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_336",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "add_197",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_335",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "mul_337",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_337",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "convert_element_type_1016",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1016",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward",
+      "name": "alias_default_993",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_993",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_750",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "einsum_default_301",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_751",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "permute_527",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_993",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_527",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "einsum_default_302",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_300",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_302",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26",
+      "name": "add_198",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_301",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "permute_528",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_528",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "dtype_cast_340",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_340",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.feed_forward.w1",
+      "name": "alias_default_1660",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_198",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_1021",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_746",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_1022",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_747",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_1023",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1021",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_994",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_994",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1023",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_338",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1022",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_749",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_339",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_338",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_995",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_996",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_996",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_995",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_340",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_340",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "sum_33",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_996",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "div_43",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_341",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_995",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_341",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "sub_17",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_749",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_342",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_994",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_996",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "mul_343",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_343",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "sum_34",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_342",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_1024",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_34",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "convert_element_type_1025",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_988",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1024",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "add_199",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1025",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "dtype_cast_341",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_341",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.ffn_norm",
+      "name": "alias_default_1664",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_199",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "alias_default_997",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_997",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_744",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "einsum_default_303",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_745",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "permute_531",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_997",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_531",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "einsum_default_304",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_303",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "permute_532",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_532",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "dtype_cast_342",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_342",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wo",
+      "name": "alias_default_1659",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_304",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_932",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_932",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_533",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_533",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_740",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_741",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_742",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_743",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_235",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_240",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_241",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_5",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_303",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_304",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_5",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.sdpa",
+      "name": "getitem_305",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_305",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_534",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_304",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_535",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_303",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "permute_536",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_534",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_933",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_933",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "sum_35",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "squeeze_10",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_535",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_934",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_934",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "sum_36",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "squeeze_11",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_1030",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_536",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_1031",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1030",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_935",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_935",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_complex_74",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_739",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "_conj_10",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_10",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "clone_110",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_74",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_110",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "mul_344",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1031",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_936",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_936",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_complex_75",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_739",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "_conj_11",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "clone_111",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_75",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_111",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "mul_345",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_344",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_real_74",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_74",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_937",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_937",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_1032",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_345",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_as_real_75",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_75",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_938",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_938",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "convert_element_type_1033",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_939",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1032",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_940",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1033",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "view_941",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_939",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_998",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_998",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_735",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "einsum_default_305",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_738",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "permute_539",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_998",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_539",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "einsum_default_306",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_305",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "permute_540",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_540",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "dtype_cast_343",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_343",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wv",
+      "name": "alias_default_1658",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_940",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_999",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_999",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_735",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "einsum_default_307",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_737",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "permute_543",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_999",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_543",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "einsum_default_308",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_306",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_308",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "add_200",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_307",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "permute_544",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_544",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "dtype_cast_344",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_344",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wk",
+      "name": "alias_default_1657",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_941",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention",
+      "name": "alias_default_1000",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1000",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_735",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "einsum_default_309",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_736",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "permute_547",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1000",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_547",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "einsum_default_310",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_200",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_310",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26",
+      "name": "add_201",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_309",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "permute_548",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_548",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "dtype_cast_345",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_345",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention.wq",
+      "name": "alias_default_1656",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_201",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_1046",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_731",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_1047",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_732",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_1048",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1046",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_1001",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1001",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1048",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_346",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1047",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_734",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_347",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_1002",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_347",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_1003",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1003",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1002",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_348",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "sum_37",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1003",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "div_44",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_349",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1002",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "sub_18",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_734",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_350",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1001",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1003",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "mul_351",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_351",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "sum_38",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_350",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_1049",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_38",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "convert_element_type_1050",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_997",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1049",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "add_202",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1050",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "dtype_cast_346",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_346",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.26.attention_norm",
+      "name": "alias_default_1663",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "alias_default_1004",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1004",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_729",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "einsum_default_311",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_730",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "permute_551",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1004",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_551",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "einsum_default_312",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_311",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "permute_552",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_552",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "dtype_cast_347",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_347",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "alias_default_1652",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_312",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w2",
+      "name": "alias_default_1005",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1005",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_726",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_352",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1005",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_728",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_353",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_352",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_1006",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1006",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_722",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "einsum_default_313",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_727",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "permute_555",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1006",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_555",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "einsum_default_314",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_313",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "permute_556",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_556",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "dtype_cast_348",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_348",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w3",
+      "name": "alias_default_1653",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_353",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "convert_element_type_1059",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_724",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "convert_element_type_1060",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1060",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_1007",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1007",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "neg_38",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "exp_38",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "add_203",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_203",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "reciprocal_6",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_6",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_354",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_354",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_1008",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1059",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1008",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_355",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1008",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "sub_19",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1007",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_356",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_356",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "add_204",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_355",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_204",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "mul_357",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_357",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "convert_element_type_1061",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1061",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward",
+      "name": "alias_default_1009",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1009",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_722",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "einsum_default_315",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_723",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "permute_559",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1009",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_559",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "einsum_default_316",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_314",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_316",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25",
+      "name": "add_205",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_315",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "permute_560",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_560",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "dtype_cast_349",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_349",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.feed_forward.w1",
+      "name": "alias_default_1651",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_205",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_1066",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_718",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_1067",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_719",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_1068",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1066",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_1010",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1010",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1068",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_358",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1067",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_721",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_359",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_358",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_1011",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_359",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_1012",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1012",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1011",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_360",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_360",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "sum_39",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1012",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "div_45",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_361",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1011",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_361",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "sub_20",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_721",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_362",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1010",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1012",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "mul_363",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_363",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "sum_40",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_1069",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_40",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "convert_element_type_1070",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1004",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1069",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "add_206",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1070",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "dtype_cast_350",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_350",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.ffn_norm",
+      "name": "alias_default_1655",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_206",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "alias_default_1013",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1013",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_716",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "einsum_default_317",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_717",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "permute_563",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1013",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_563",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "einsum_default_318",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_317",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "permute_564",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_564",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "dtype_cast_351",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_351",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wo",
+      "name": "alias_default_1650",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_318",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_956",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_956",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_565",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_565",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_712",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_713",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_714",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_715",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_226",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_231",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_232",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_6",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_306",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_307",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.sdpa",
+      "name": "getitem_308",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_308",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_566",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_307",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_567",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_306",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "permute_568",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_566",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_957",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_957",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "sum_41",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "squeeze_12",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_567",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_958",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_958",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "sum_42",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "squeeze_13",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_1075",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_568",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_1076",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1075",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_959",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_959",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_complex_76",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_711",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "_conj_12",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_12",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "clone_118",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_76",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_118",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "mul_364",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1076",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_960",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_960",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_complex_77",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_711",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "_conj_13",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_13",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "clone_119",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_77",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_119",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "mul_365",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_364",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_real_76",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_76",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_961",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_961",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_1077",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_365",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_as_real_77",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_77",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_962",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_962",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "convert_element_type_1078",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_963",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1077",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_964",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1078",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "view_965",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_963",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_1014",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1014",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_707",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "einsum_default_319",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_710",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "permute_571",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1014",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_571",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "einsum_default_320",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_319",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "permute_572",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_572",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "dtype_cast_352",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_352",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wv",
+      "name": "alias_default_1649",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_964",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_1015",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1015",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_707",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "einsum_default_321",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_709",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "permute_575",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1015",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_575",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "einsum_default_322",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_320",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "add_207",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_321",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "permute_576",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_576",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "dtype_cast_353",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_353",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wk",
+      "name": "alias_default_1648",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_965",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention",
+      "name": "alias_default_1016",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1016",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_707",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "einsum_default_323",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_708",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "permute_579",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1016",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_579",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "einsum_default_324",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_207",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_324",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25",
+      "name": "add_208",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_323",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "permute_580",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_580",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "dtype_cast_354",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_354",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention.wq",
+      "name": "alias_default_1647",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_208",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_1091",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_703",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_1092",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_704",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_1093",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1091",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_1017",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1017",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1093",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_366",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1092",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_706",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_367",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_366",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_1018",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_367",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_1019",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1019",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1018",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_368",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_368",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "sum_43",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1019",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "div_46",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_369",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1018",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "sub_21",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_706",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_370",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1017",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1019",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "mul_371",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_371",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "sum_44",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_1094",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_44",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "convert_element_type_1095",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1013",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1094",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "add_209",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1095",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "dtype_cast_355",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_355",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.25.attention_norm",
+      "name": "alias_default_1654",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_209",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "alias_default_1020",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1020",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_701",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "einsum_default_325",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_702",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "permute_583",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1020",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_583",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "einsum_default_326",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_325",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "permute_584",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_584",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "dtype_cast_356",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_356",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "alias_default_1643",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_326",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w2",
+      "name": "alias_default_1021",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1021",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_698",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_372",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1021",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_700",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_373",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_372",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_1022",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1022",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_694",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "einsum_default_327",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_699",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "permute_587",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1022",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_587",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "einsum_default_328",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_327",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "permute_588",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_588",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "dtype_cast_357",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_357",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w3",
+      "name": "alias_default_1644",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_373",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "convert_element_type_1104",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_696",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "convert_element_type_1105",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_1023",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1023",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "neg_39",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "exp_39",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "add_210",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_210",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "reciprocal_7",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_7",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_374",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_374",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_1024",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1024",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_375",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1024",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "sub_22",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1023",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_376",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_376",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "add_211",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_375",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_211",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "mul_377",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_377",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "convert_element_type_1106",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward",
+      "name": "alias_default_1025",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1025",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_694",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "einsum_default_329",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_695",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "permute_591",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1025",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_591",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "einsum_default_330",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_328",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_330",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24",
+      "name": "add_212",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_329",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "permute_592",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_592",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "dtype_cast_358",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_358",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.feed_forward.w1",
+      "name": "alias_default_1642",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_212",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_1111",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_690",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_1112",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_691",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_1113",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1111",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_1026",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1026",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1113",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_378",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1112",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_693",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_379",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_1027",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_379",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_1028",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1028",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1027",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_380",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "sum_45",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1028",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "div_47",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_381",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1027",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "sub_23",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_693",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_382",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1026",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1028",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "mul_383",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_383",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "sum_46",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_1114",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_46",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "convert_element_type_1115",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1020",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1114",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "add_213",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1115",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "dtype_cast_359",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_359",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.ffn_norm",
+      "name": "alias_default_1646",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_213",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "alias_default_1029",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1029",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_688",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "einsum_default_331",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_689",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "permute_595",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1029",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_595",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "einsum_default_332",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_331",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "permute_596",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_596",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "dtype_cast_360",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_360",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wo",
+      "name": "alias_default_1641",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_332",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_980",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_980",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_597",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_597",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_684",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_685",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_686",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_687",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_222",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_223",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_7",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_309",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_310",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_7",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.sdpa",
+      "name": "getitem_311",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_598",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_310",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_599",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_309",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "permute_600",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_598",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_981",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_981",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "sum_47",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "squeeze_14",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_599",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_982",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_982",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "sum_48",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "squeeze_15",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_1120",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_600",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_1121",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_983",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_983",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_complex_78",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_683",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "_conj_14",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_14",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "clone_126",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_78",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_126",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "mul_384",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_984",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_984",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_complex_79",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_683",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "_conj_15",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_15",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "clone_127",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_79",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_127",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "mul_385",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_384",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_real_78",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_78",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_985",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_985",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_1122",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_385",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_as_real_79",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_79",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_986",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_986",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "convert_element_type_1123",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_987",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1122",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_988",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1123",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "view_989",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_987",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_1030",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1030",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_679",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "einsum_default_333",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_682",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "permute_603",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1030",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_603",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "einsum_default_334",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_333",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "permute_604",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_604",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "dtype_cast_361",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_361",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wv",
+      "name": "alias_default_1640",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_988",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_1031",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1031",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_679",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "einsum_default_335",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_681",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "permute_607",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1031",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_607",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "einsum_default_336",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_334",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_336",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "add_214",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_335",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "permute_608",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_608",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "dtype_cast_362",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_362",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wk",
+      "name": "alias_default_1639",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_989",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention",
+      "name": "alias_default_1032",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1032",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_679",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "einsum_default_337",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_680",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "permute_611",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1032",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_611",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "einsum_default_338",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_338",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24",
+      "name": "add_215",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_337",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "permute_612",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_612",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "dtype_cast_363",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_363",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention.wq",
+      "name": "alias_default_1638",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_215",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_1136",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_675",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_1137",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_676",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_1138",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1136",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_1033",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1033",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1138",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_386",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1137",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_678",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_387",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_386",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_1034",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_387",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_1035",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1035",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1034",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_388",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_388",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "sum_49",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1035",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "div_48",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_389",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1034",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_389",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "sub_24",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_678",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_390",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1033",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1035",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "mul_391",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_391",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "sum_50",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_390",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_1139",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_50",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "convert_element_type_1140",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1029",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1139",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "add_216",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1140",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "dtype_cast_364",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_364",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.24.attention_norm",
+      "name": "alias_default_1645",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "alias_default_1036",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1036",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_673",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "einsum_default_339",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_674",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "permute_615",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1036",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_615",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "einsum_default_340",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_339",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "permute_616",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_616",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "dtype_cast_365",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_365",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "alias_default_1634",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_340",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w2",
+      "name": "alias_default_1037",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1037",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_670",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_392",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1037",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_672",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_393",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_1038",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1038",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_666",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "einsum_default_341",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_671",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "permute_619",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1038",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_619",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "einsum_default_342",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_341",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "permute_620",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_620",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "dtype_cast_366",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_366",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w3",
+      "name": "alias_default_1635",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "convert_element_type_1149",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_668",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "convert_element_type_1150",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1150",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_1039",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1039",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "neg_40",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "exp_40",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "add_217",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_217",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "reciprocal_8",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_8",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_394",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_394",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_1040",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1040",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_395",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1040",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "sub_25",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1039",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_396",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_396",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "add_218",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_395",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_218",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "mul_397",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_397",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "convert_element_type_1151",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1151",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward",
+      "name": "alias_default_1041",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1041",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_666",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "einsum_default_343",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_667",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "permute_623",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1041",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_623",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "einsum_default_344",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_342",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_344",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23",
+      "name": "add_219",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_343",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "permute_624",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_624",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "dtype_cast_367",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_367",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.feed_forward.w1",
+      "name": "alias_default_1633",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_219",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_1156",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_662",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_1157",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_663",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_1158",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_1042",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1042",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1158",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_398",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_665",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_399",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_398",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_1043",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_399",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_1044",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1044",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1043",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_400",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_400",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "sum_51",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1044",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "div_49",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_401",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1043",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_401",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "sub_26",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_665",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_402",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1042",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1044",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "mul_403",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_403",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "sum_52",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_402",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_1159",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_52",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "convert_element_type_1160",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1036",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1159",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "add_220",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1160",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "dtype_cast_368",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_368",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.ffn_norm",
+      "name": "alias_default_1637",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_220",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "alias_default_1045",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1045",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_660",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "einsum_default_345",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_661",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "permute_627",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1045",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_627",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "einsum_default_346",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_345",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "permute_628",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_628",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "dtype_cast_369",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_369",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wo",
+      "name": "alias_default_1632",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1004",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1004",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_629",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_629",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_656",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_657",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_658",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_659",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_208",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_213",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_214",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_8",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_312",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_313",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_8",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.sdpa",
+      "name": "getitem_314",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_630",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_313",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_631",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_312",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "permute_632",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_630",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1005",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1005",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "sum_53",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "squeeze_16",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_631",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1006",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1006",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "sum_54",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "squeeze_17",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_1165",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_632",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_1166",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1007",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1007",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_complex_80",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_655",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "_conj_16",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_16",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "clone_134",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_80",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_134",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "mul_404",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1166",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1008",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1008",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_complex_81",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_655",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "_conj_17",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_17",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "clone_135",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_135",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "mul_405",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_404",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_real_80",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_80",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1009",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1009",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_1167",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_405",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_as_real_81",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_81",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1010",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1010",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "convert_element_type_1168",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1011",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1167",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1012",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "view_1013",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1011",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_1046",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1046",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_651",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "einsum_default_347",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_654",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "permute_635",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1046",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_635",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "einsum_default_348",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_347",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "permute_636",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_636",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "dtype_cast_370",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_370",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wv",
+      "name": "alias_default_1631",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1012",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_1047",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1047",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_651",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "einsum_default_349",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_653",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "permute_639",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1047",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_639",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "einsum_default_350",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_350",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "add_221",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_349",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "permute_640",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_640",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "dtype_cast_371",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_371",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wk",
+      "name": "alias_default_1630",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1013",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention",
+      "name": "alias_default_1048",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1048",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_651",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "einsum_default_351",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_652",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "permute_643",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1048",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_643",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "einsum_default_352",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_221",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_352",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23",
+      "name": "add_222",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_351",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "permute_644",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_644",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "dtype_cast_372",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_372",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention.wq",
+      "name": "alias_default_1629",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_222",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_1181",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_647",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_1182",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_648",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_1183",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_1049",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1049",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1183",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_406",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1182",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_650",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_407",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_406",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_1050",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_407",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_1051",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1051",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1050",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_408",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_408",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "sum_55",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1051",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "div_50",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_409",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1050",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_409",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "sub_27",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_650",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_410",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1049",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1051",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "mul_411",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_411",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "sum_56",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_410",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_1184",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_56",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "convert_element_type_1185",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1045",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1184",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "add_223",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1185",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "dtype_cast_373",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_373",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.23.attention_norm",
+      "name": "alias_default_1636",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_223",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "alias_default_1052",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1052",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_645",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "einsum_default_353",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_646",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "permute_647",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1052",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_647",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "einsum_default_354",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_353",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "permute_648",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_648",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "dtype_cast_374",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_374",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "alias_default_1625",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_354",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w2",
+      "name": "alias_default_1053",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1053",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_642",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_412",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1053",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_644",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_413",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_412",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_1054",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1054",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_638",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "einsum_default_355",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_643",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "permute_651",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1054",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_651",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "einsum_default_356",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_355",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "permute_652",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_652",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "dtype_cast_375",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_375",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w3",
+      "name": "alias_default_1626",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_413",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "convert_element_type_1194",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_640",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "convert_element_type_1195",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1195",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_1055",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1055",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "neg_41",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "exp_41",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "add_224",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "reciprocal_9",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_9",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_414",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_414",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_1056",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1194",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1056",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_415",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1056",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "sub_28",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1055",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_416",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "add_225",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_415",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_225",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "mul_417",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_417",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "convert_element_type_1196",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1196",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward",
+      "name": "alias_default_1057",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1057",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_638",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "einsum_default_357",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_639",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "permute_655",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1057",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_655",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "einsum_default_358",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_356",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_358",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22",
+      "name": "add_226",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_357",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "permute_656",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_656",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "dtype_cast_376",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_376",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.feed_forward.w1",
+      "name": "alias_default_1624",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_226",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_1201",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_634",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_1202",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_635",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_1203",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1201",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_1058",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1058",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1203",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_418",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_637",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_419",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_418",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_1059",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_419",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_1060",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1060",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1059",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_420",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_420",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "sum_57",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1060",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "div_51",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_421",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1059",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_421",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "sub_29",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_637",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_422",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1058",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1060",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "mul_423",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_423",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "sum_58",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_422",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_1204",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_58",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "convert_element_type_1205",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1052",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1204",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "add_227",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1205",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "dtype_cast_377",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_377",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.ffn_norm",
+      "name": "alias_default_1628",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "alias_default_1061",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1061",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_632",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "einsum_default_359",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_633",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "permute_659",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1061",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_659",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "einsum_default_360",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_359",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "permute_660",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_660",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "dtype_cast_378",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_378",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wo",
+      "name": "alias_default_1623",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_360",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1028",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1028",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_661",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_661",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_628",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_629",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_630",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_631",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_199",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_204",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_205",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_9",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_315",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_316",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_9",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.sdpa",
+      "name": "getitem_317",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_317",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_662",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_316",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_663",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_315",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "permute_664",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_662",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1029",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1029",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "sum_59",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "squeeze_18",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_663",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1030",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1030",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "sum_60",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "squeeze_19",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_1210",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_664",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_1211",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1210",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1031",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1031",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_complex_82",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_627",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "_conj_18",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_18",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "clone_142",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_142",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "mul_424",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1211",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1032",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1032",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_complex_83",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_627",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "_conj_19",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_19",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "clone_143",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_143",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "mul_425",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_424",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_real_82",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1033",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1033",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_1212",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_425",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_as_real_83",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1034",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1034",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "convert_element_type_1213",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1035",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1212",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1036",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1213",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "view_1037",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1035",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_1062",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1062",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_623",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "einsum_default_361",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_626",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "permute_667",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1062",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_667",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "einsum_default_362",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_361",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "permute_668",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_668",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "dtype_cast_379",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_379",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wv",
+      "name": "alias_default_1622",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1036",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_1063",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1063",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_623",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "einsum_default_363",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_625",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "permute_671",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1063",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_671",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "einsum_default_364",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_364",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "add_228",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_363",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "permute_672",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_672",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "dtype_cast_380",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_380",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wk",
+      "name": "alias_default_1621",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1037",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention",
+      "name": "alias_default_1064",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1064",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_623",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "einsum_default_365",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_624",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "permute_675",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1064",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_675",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "einsum_default_366",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_228",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_366",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22",
+      "name": "add_229",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_365",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "permute_676",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_676",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "dtype_cast_381",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_381",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention.wq",
+      "name": "alias_default_1620",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_229",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_1226",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_619",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_1227",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_620",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_1228",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1226",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_1065",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1065",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1228",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_426",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_622",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_427",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_426",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_1066",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_427",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_1067",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1067",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1066",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_428",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_428",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "sum_61",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1067",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "div_52",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_52",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_429",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1066",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_429",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "sub_30",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_622",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_430",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1065",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1067",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "mul_431",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_431",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "sum_62",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_430",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_1229",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_62",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "convert_element_type_1230",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1061",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1229",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "add_230",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1230",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "dtype_cast_382",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_382",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.22.attention_norm",
+      "name": "alias_default_1627",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_230",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "alias_default_1068",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1068",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_617",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "einsum_default_367",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_618",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "permute_679",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1068",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_679",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "einsum_default_368",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_367",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "permute_680",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_680",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "dtype_cast_383",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_383",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "alias_default_1616",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_368",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w2",
+      "name": "alias_default_1069",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1069",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_614",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_432",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1069",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_616",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_433",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_432",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_1070",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1070",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_610",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "einsum_default_369",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_615",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "permute_683",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1070",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_683",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "einsum_default_370",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_369",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "permute_684",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_684",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "dtype_cast_384",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_384",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w3",
+      "name": "alias_default_1617",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_433",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "convert_element_type_1239",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_612",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "convert_element_type_1240",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1240",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_1071",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1071",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "neg_42",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "exp_42",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "add_231",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_231",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "reciprocal_10",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_10",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_434",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_434",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_1072",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1239",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1072",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_435",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1072",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "sub_31",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1071",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_436",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_436",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "add_232",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_435",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_232",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "mul_437",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_437",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "convert_element_type_1241",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1241",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward",
+      "name": "alias_default_1073",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1073",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_610",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "einsum_default_371",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_611",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "permute_687",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1073",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_687",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "einsum_default_372",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_370",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_372",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21",
+      "name": "add_233",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_371",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "permute_688",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_688",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "dtype_cast_385",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_385",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.feed_forward.w1",
+      "name": "alias_default_1615",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_233",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_1246",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_606",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_1247",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_607",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_1248",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1246",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_1074",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1074",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1248",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_438",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1247",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_609",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_439",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_438",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_1075",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_439",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_1076",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1076",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1075",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_440",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_440",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "sum_63",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1076",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "div_53",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_53",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_441",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1075",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_441",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "sub_32",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_32",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_609",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_442",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1074",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1076",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "mul_443",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_443",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "sum_64",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_442",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_1249",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_64",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "convert_element_type_1250",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1068",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1249",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "add_234",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1250",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "dtype_cast_386",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_386",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.ffn_norm",
+      "name": "alias_default_1619",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "alias_default_1077",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1077",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_604",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "einsum_default_373",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_605",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "permute_691",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1077",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_691",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "einsum_default_374",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_373",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "permute_692",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_692",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "dtype_cast_387",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_387",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wo",
+      "name": "alias_default_1614",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_374",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1052",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1052",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_693",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_693",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_600",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_601",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_602",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_603",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_190",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_195",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_196",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_10",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_318",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_319",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.sdpa",
+      "name": "getitem_320",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_320",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_694",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_319",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_695",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_318",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "permute_696",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_694",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1053",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1053",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "sum_65",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_65",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "squeeze_20",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_695",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1054",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1054",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "sum_66",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_66",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "squeeze_21",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_1255",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_696",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_1256",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1255",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1055",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1055",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_complex_84",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_599",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "_conj_20",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_20",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "clone_150",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_150",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "mul_444",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1256",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1056",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1056",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_complex_85",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_599",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "_conj_21",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_21",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "clone_151",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "mul_445",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_444",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_real_84",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1057",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1057",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_1257",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_445",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_as_real_85",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1058",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1058",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "convert_element_type_1258",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1059",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1257",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1060",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1258",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "view_1061",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1059",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_1078",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1078",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_595",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "einsum_default_375",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_598",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "permute_699",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1078",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_699",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "einsum_default_376",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_375",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "permute_700",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_700",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "dtype_cast_388",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_388",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wv",
+      "name": "alias_default_1613",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1060",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_1079",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1079",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_595",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "einsum_default_377",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_597",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "permute_703",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1079",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_703",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "einsum_default_378",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_376",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "add_235",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_377",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "permute_704",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_704",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "dtype_cast_389",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_389",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wk",
+      "name": "alias_default_1612",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1061",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention",
+      "name": "alias_default_1080",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1080",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_595",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "einsum_default_379",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_596",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "permute_707",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1080",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_707",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "einsum_default_380",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_235",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21",
+      "name": "add_236",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_379",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "permute_708",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_708",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "dtype_cast_390",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_390",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention.wq",
+      "name": "alias_default_1611",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_1271",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_591",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_1272",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_592",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_1273",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1271",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_1081",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1081",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1273",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_446",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1272",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_594",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_447",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_446",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_1082",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_447",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_1083",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1083",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1082",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_448",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_448",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "sum_67",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1083",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "div_54",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_67",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_449",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1082",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_449",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "sub_33",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_33",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_594",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_450",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1081",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1083",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "mul_451",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_451",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "sum_68",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_450",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_1274",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_68",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "convert_element_type_1275",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1077",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1274",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "add_237",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1275",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "dtype_cast_391",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_391",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.21.attention_norm",
+      "name": "alias_default_1618",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "alias_default_1084",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1084",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_589",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "einsum_default_381",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_590",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "permute_711",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1084",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_711",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "einsum_default_382",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_381",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "permute_712",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_712",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "dtype_cast_392",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_392",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "alias_default_1607",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_382",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w2",
+      "name": "alias_default_1085",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1085",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_586",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_452",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1085",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_588",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_453",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_452",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_1086",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1086",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_582",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "einsum_default_383",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_587",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "permute_715",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1086",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_715",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "einsum_default_384",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_383",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "permute_716",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_716",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "dtype_cast_393",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_393",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w3",
+      "name": "alias_default_1608",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_453",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "convert_element_type_1284",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_584",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "convert_element_type_1285",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1285",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_1087",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1087",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "neg_43",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "exp_43",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "add_238",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_238",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "reciprocal_11",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_11",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_454",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_454",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_1088",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1284",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1088",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_455",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1088",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "sub_34",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1087",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_456",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_456",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "add_239",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_455",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_239",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "mul_457",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_457",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "convert_element_type_1286",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1286",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward",
+      "name": "alias_default_1089",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1089",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_582",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "einsum_default_385",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_583",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "permute_719",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1089",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_719",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "einsum_default_386",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_384",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_386",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20",
+      "name": "add_240",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_385",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "permute_720",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_720",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "dtype_cast_394",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_394",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.feed_forward.w1",
+      "name": "alias_default_1606",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_240",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_1291",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_578",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_1292",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_579",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_1293",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_1090",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1090",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1293",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_458",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1292",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_581",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_459",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_458",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_1091",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_459",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_1092",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1092",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1091",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_460",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_460",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "sum_69",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1092",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "div_55",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_461",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1091",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_461",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "sub_35",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_35",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_581",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_462",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1090",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1092",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "mul_463",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_463",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "sum_70",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_462",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_1294",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_70",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "convert_element_type_1295",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1084",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1294",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "add_241",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1295",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "dtype_cast_395",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_395",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.ffn_norm",
+      "name": "alias_default_1610",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "alias_default_1093",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1093",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_576",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "einsum_default_387",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_577",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "permute_723",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1093",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_723",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "einsum_default_388",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_387",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "permute_724",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_724",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "dtype_cast_396",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_396",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wo",
+      "name": "alias_default_1605",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_388",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1076",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1076",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_725",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_725",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_572",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_573",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_574",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_575",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_186",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_187",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_11",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_321",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_322",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_11",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.sdpa",
+      "name": "getitem_323",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_726",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_727",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "permute_728",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_726",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1077",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1077",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "sum_71",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_71",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "squeeze_22",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_727",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1078",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1078",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "sum_72",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_72",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "squeeze_23",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_1300",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_728",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_1301",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1300",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1079",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1079",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_complex_86",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_571",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "_conj_22",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_22",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "clone_158",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_86",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_158",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "mul_464",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1301",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1080",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1080",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_complex_87",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_571",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "_conj_23",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_23",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "clone_159",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_87",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_159",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "mul_465",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_464",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_real_86",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_86",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1081",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1081",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_1302",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_465",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_as_real_87",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_87",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1082",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1082",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "convert_element_type_1303",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1083",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1302",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1084",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1303",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "view_1085",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1083",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_1094",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1094",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_567",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "einsum_default_389",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_570",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "permute_731",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1094",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_731",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "einsum_default_390",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_389",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "permute_732",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_732",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "dtype_cast_397",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_397",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wv",
+      "name": "alias_default_1604",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1084",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_1095",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1095",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_567",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "einsum_default_391",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_569",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "permute_735",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1095",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_735",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "einsum_default_392",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_390",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_392",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "add_242",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_391",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "permute_736",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_736",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "dtype_cast_398",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_398",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wk",
+      "name": "alias_default_1603",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1085",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention",
+      "name": "alias_default_1096",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1096",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_567",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "einsum_default_393",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_568",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "permute_739",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1096",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_739",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "einsum_default_394",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_394",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20",
+      "name": "add_243",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_393",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "permute_740",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_740",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "dtype_cast_399",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_399",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention.wq",
+      "name": "alias_default_1602",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_1316",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_563",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_1317",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_564",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_1318",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1316",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_1097",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1097",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1318",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_466",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1317",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_566",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_467",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_466",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_1098",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_467",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_1099",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1099",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1098",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_468",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_468",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "sum_73",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1099",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "div_56",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_73",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_469",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1098",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_469",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "sub_36",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_36",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_566",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_470",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1097",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1099",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "mul_471",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_471",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "sum_74",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_470",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_1319",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_74",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "convert_element_type_1320",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1093",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1319",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "add_244",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1320",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "dtype_cast_400",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_400",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.20.attention_norm",
+      "name": "alias_default_1609",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_244",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "alias_default_1100",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1100",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_561",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "einsum_default_395",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_562",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "permute_743",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1100",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_743",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "einsum_default_396",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_395",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "permute_744",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_744",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "dtype_cast_401",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_401",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "alias_default_1598",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_396",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w2",
+      "name": "alias_default_1101",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_558",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_472",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_560",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_473",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_472",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_1102",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_554",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "einsum_default_397",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_559",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "permute_747",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_747",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "einsum_default_398",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_397",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "permute_748",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_748",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "dtype_cast_402",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_402",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w3",
+      "name": "alias_default_1599",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_473",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "convert_element_type_1329",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_556",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "convert_element_type_1330",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1330",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_1103",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "neg_44",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "exp_44",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "add_245",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_245",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "reciprocal_12",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_12",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_474",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_474",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_1104",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1329",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_475",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "sub_37",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_476",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_476",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "add_246",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_475",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_246",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "mul_477",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_477",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "convert_element_type_1331",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1331",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward",
+      "name": "alias_default_1105",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_554",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "einsum_default_399",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_555",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "permute_751",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_751",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "einsum_default_400",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_398",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_400",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19",
+      "name": "add_247",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_399",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "permute_752",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_752",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "dtype_cast_403",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_403",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.feed_forward.w1",
+      "name": "alias_default_1597",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_247",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_1336",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_550",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_1337",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_551",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_1338",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1336",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_1106",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1106",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1338",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_478",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1337",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_553",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_479",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_478",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_1107",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_479",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_1108",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1108",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1107",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_480",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_480",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "sum_75",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1108",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "div_57",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_75",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_481",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1107",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_481",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "sub_38",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_38",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_553",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_482",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1106",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1108",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "mul_483",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_483",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "sum_76",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_482",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_1339",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_76",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "convert_element_type_1340",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1100",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "add_248",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1340",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "dtype_cast_404",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_404",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.ffn_norm",
+      "name": "alias_default_1601",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_248",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "alias_default_1109",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_548",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "einsum_default_401",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_549",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "permute_755",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_755",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "einsum_default_402",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_401",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "permute_756",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_756",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "dtype_cast_405",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_405",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wo",
+      "name": "alias_default_1596",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_402",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1100",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1100",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_757",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_757",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_544",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_545",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_546",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_547",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_177",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_178",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_12",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_324",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_325",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.sdpa",
+      "name": "getitem_326",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_758",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_325",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_759",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_324",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "permute_760",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_758",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1101",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "sum_77",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_77",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "squeeze_24",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_759",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1102",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "sum_78",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_78",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "squeeze_25",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_1345",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_760",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_1346",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1345",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1103",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_complex_88",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_543",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "_conj_24",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_24",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "clone_166",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_88",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_166",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "mul_484",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1346",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1104",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_complex_89",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_543",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "_conj_25",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_25",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "clone_167",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_89",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_167",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "mul_485",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_484",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_real_88",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_88",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1105",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_1347",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_485",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_as_real_89",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_89",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1106",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "convert_element_type_1348",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1107",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1347",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1108",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1348",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "view_1109",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_1110",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_539",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "einsum_default_403",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_542",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "permute_763",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_763",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "einsum_default_404",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_403",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "permute_764",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_764",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "dtype_cast_406",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_406",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wv",
+      "name": "alias_default_1595",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_1111",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_539",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "einsum_default_405",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_541",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "permute_767",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_767",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "einsum_default_406",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_404",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_406",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "add_249",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_405",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "permute_768",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_768",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "dtype_cast_407",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_407",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wk",
+      "name": "alias_default_1594",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention",
+      "name": "alias_default_1112",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_539",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "einsum_default_407",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_540",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "permute_771",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_771",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "einsum_default_408",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_249",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_408",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19",
+      "name": "add_250",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_407",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "permute_772",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_772",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "dtype_cast_408",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_408",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention.wq",
+      "name": "alias_default_1593",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_250",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_1361",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_535",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_1362",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_536",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_1363",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1361",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_1113",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1113",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1363",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_486",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_538",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_487",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_486",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_1114",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_487",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_1115",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1114",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_488",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_488",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "sum_79",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "div_58",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_58",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_79",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_489",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1114",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_489",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "sub_39",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_39",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_538",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_490",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1113",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "mul_491",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_491",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "sum_80",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_490",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_1364",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_80",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "convert_element_type_1365",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1364",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "add_251",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1365",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "dtype_cast_409",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_409",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.19.attention_norm",
+      "name": "alias_default_1600",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_251",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "alias_default_1116",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1116",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_533",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "einsum_default_409",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_534",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "permute_775",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1116",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_775",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "einsum_default_410",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_409",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "permute_776",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_776",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "dtype_cast_410",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_410",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "alias_default_1589",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_410",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w2",
+      "name": "alias_default_1117",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1117",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_530",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_492",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1117",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_532",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_493",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_492",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_1118",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1118",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_526",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "einsum_default_411",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_531",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "permute_779",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1118",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_779",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "einsum_default_412",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_411",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "permute_780",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_780",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "dtype_cast_411",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_411",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w3",
+      "name": "alias_default_1590",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_493",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "convert_element_type_1374",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_528",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "convert_element_type_1375",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1375",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_1119",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1119",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "neg_45",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "exp_45",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "add_252",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_252",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "reciprocal_13",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_13",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_494",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_494",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_1120",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1374",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_495",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "sub_40",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1119",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_496",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_496",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "add_253",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_495",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_253",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "mul_497",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_497",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "convert_element_type_1376",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1376",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward",
+      "name": "alias_default_1121",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_526",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "einsum_default_413",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_527",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "permute_783",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_783",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "einsum_default_414",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_412",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_414",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18",
+      "name": "add_254",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_413",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "permute_784",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_784",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "dtype_cast_412",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_412",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.feed_forward.w1",
+      "name": "alias_default_1588",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_254",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_1381",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_522",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_1382",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_523",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_1383",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_1122",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1122",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1383",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_498",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_525",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_499",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_498",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_1123",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_499",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_1124",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1123",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_500",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_500",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "sum_81",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "div_59",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_501",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1123",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_501",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "sub_41",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_525",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_502",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1122",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "mul_503",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_503",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "sum_82",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_502",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_1384",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_82",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "convert_element_type_1385",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1116",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1384",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "add_255",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1385",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "dtype_cast_413",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_413",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.ffn_norm",
+      "name": "alias_default_1592",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_255",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "alias_default_1125",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_520",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "einsum_default_415",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_521",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "permute_787",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_787",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "einsum_default_416",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_415",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "permute_788",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_788",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "dtype_cast_414",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_414",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wo",
+      "name": "alias_default_1587",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_416",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1124",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1124",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_789",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_789",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_516",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_517",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_518",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_519",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_163",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_168",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_169",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_13",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_327",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_328",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.sdpa",
+      "name": "getitem_329",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_329",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_790",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_328",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_791",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_327",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "permute_792",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_790",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1125",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1125",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "sum_83",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_83",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "squeeze_26",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_791",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1126",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "sum_84",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "squeeze_27",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_1390",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_792",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_1391",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1127",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_complex_90",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_515",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "_conj_26",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_26",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "clone_174",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_90",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_174",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "mul_504",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1391",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1128",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_complex_91",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_515",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "_conj_27",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_27",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "clone_175",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_91",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_175",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "mul_505",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_504",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_real_90",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_90",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1129",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1129",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_1392",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_505",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_as_real_91",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_91",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1130",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1130",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "convert_element_type_1393",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1131",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1132",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "view_1133",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1131",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_1126",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_511",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "einsum_default_417",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_514",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "permute_795",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_795",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "einsum_default_418",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_417",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "permute_796",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_796",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "dtype_cast_415",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_415",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wv",
+      "name": "alias_default_1586",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1132",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_1127",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_511",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "einsum_default_419",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_513",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "permute_799",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_799",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "einsum_default_420",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_418",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_420",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "add_256",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_419",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "permute_800",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_800",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "dtype_cast_416",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_416",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wk",
+      "name": "alias_default_1585",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1133",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention",
+      "name": "alias_default_1128",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_511",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "einsum_default_421",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_512",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "permute_803",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1128",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_803",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "einsum_default_422",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_256",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_422",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18",
+      "name": "add_257",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_421",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "permute_804",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_804",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "dtype_cast_417",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_417",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention.wq",
+      "name": "alias_default_1584",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_257",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_1406",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_507",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_1407",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_508",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_1408",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1406",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_1129",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1129",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1408",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_506",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1407",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_510",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_507",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_506",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_1130",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_507",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_1131",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1131",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_508",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_508",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "sum_85",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1131",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "div_60",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_85",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_509",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_509",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "sub_42",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_510",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_510",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1129",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1131",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "mul_511",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_511",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "sum_86",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_510",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_1409",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_86",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "convert_element_type_1410",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1409",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "add_258",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1410",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "dtype_cast_418",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_418",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.18.attention_norm",
+      "name": "alias_default_1591",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "alias_default_1132",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1132",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_505",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "einsum_default_423",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_506",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "permute_807",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1132",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_807",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "einsum_default_424",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_423",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "permute_808",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_808",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "dtype_cast_419",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_419",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "alias_default_1580",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_424",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w2",
+      "name": "alias_default_1133",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1133",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_502",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_512",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1133",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_504",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_513",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_512",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_1134",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1134",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_498",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "einsum_default_425",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_503",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "permute_811",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1134",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_811",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "einsum_default_426",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_425",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "permute_812",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_812",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "dtype_cast_420",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_420",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w3",
+      "name": "alias_default_1581",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_513",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "convert_element_type_1419",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_500",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "convert_element_type_1420",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1420",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_1135",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1135",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "neg_46",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "exp_46",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "add_259",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_259",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "reciprocal_14",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_14",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_514",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_514",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_1136",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1419",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1136",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_515",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1136",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "sub_43",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1135",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_516",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_516",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "add_260",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_515",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_260",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "mul_517",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_517",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "convert_element_type_1421",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1421",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward",
+      "name": "alias_default_1137",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_498",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "einsum_default_427",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_499",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "permute_815",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_815",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "einsum_default_428",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_426",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_428",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17",
+      "name": "add_261",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_427",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "permute_816",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_816",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "dtype_cast_421",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_421",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.feed_forward.w1",
+      "name": "alias_default_1579",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_261",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_1426",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_494",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_1427",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_495",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_1428",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1426",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_1138",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1138",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1428",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_518",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1427",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_497",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_519",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_518",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_1139",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_519",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_1140",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1140",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1139",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_520",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_520",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "sum_87",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1140",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "div_61",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_61",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_521",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1139",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_521",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "sub_44",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_497",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_522",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1138",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1140",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "mul_523",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_523",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "sum_88",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_522",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_1429",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_88",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "convert_element_type_1430",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1132",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1429",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "add_262",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1430",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "dtype_cast_422",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_422",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.ffn_norm",
+      "name": "alias_default_1583",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_262",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "alias_default_1141",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_492",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "einsum_default_429",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_493",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "permute_819",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_819",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "einsum_default_430",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_429",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "permute_820",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_820",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "dtype_cast_423",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_423",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wo",
+      "name": "alias_default_1578",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_430",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1148",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1148",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_821",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_821",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_488",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_489",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_490",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_491",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_159",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_160",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_14",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_330",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_331",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.sdpa",
+      "name": "getitem_332",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_332",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_822",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_331",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_823",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_330",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "permute_824",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_822",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1149",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "sum_89",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_89",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "squeeze_28",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_823",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1150",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1150",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "sum_90",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_90",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "squeeze_29",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_1435",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_824",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_1436",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1435",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1151",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1151",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_complex_92",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_487",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "_conj_28",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_28",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "clone_182",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_92",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_182",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "mul_524",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1436",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1152",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1152",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_complex_93",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_487",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "_conj_29",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_29",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "clone_183",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_93",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_183",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "mul_525",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_524",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_real_92",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_92",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1153",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1153",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_1437",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_525",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_as_real_93",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_93",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1154",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1154",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "convert_element_type_1438",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1155",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1437",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1156",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1438",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "view_1157",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1155",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_1142",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1142",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_483",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "einsum_default_431",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_486",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "permute_827",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1142",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_827",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "einsum_default_432",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_431",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "permute_828",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_828",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "dtype_cast_424",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_424",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wv",
+      "name": "alias_default_1577",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1156",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_1143",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1143",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_483",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "einsum_default_433",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_485",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "permute_831",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1143",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_831",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "einsum_default_434",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_432",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_434",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "add_263",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_433",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "permute_832",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_832",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "dtype_cast_425",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_425",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wk",
+      "name": "alias_default_1576",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1157",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention",
+      "name": "alias_default_1144",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1144",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_483",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "einsum_default_435",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_484",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "permute_835",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1144",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_835",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "einsum_default_436",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_263",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_436",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17",
+      "name": "add_264",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_435",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "permute_836",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_836",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "dtype_cast_426",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_426",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention.wq",
+      "name": "alias_default_1575",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_264",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_1451",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_479",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_1452",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_480",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_1453",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1451",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_1145",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1453",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_526",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1452",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_482",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_527",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_526",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_1146",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_527",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_1147",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_528",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_528",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "sum_91",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "div_62",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_529",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_529",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "sub_45",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_45",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_482",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_530",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "mul_531",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_531",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "sum_92",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_530",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_1454",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_92",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "convert_element_type_1455",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1454",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "add_265",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1455",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "dtype_cast_427",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_427",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.17.attention_norm",
+      "name": "alias_default_1582",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_265",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "alias_default_1148",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1148",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_477",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "einsum_default_437",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_478",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "permute_839",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1148",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_839",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "einsum_default_438",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_437",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "permute_840",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_840",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "dtype_cast_428",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_428",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "alias_default_1571",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_438",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w2",
+      "name": "alias_default_1149",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_474",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_532",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_476",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_533",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_532",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_1150",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1150",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_470",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "einsum_default_439",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_475",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "permute_843",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1150",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_843",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "einsum_default_440",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_439",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "permute_844",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_844",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "dtype_cast_429",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_429",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w3",
+      "name": "alias_default_1572",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_533",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "convert_element_type_1464",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_472",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "convert_element_type_1465",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1465",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_1151",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1151",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "neg_47",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "exp_47",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "add_266",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_266",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "reciprocal_15",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_15",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_534",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_534",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_1152",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1464",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1152",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_535",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1152",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "sub_46",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1151",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_536",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_536",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "add_267",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_535",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_267",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "mul_537",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_537",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "convert_element_type_1466",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1466",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward",
+      "name": "alias_default_1153",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1153",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_470",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "einsum_default_441",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_471",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "permute_847",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1153",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_847",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "einsum_default_442",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_440",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_442",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16",
+      "name": "add_268",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_441",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "permute_848",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_848",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "dtype_cast_430",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_430",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.feed_forward.w1",
+      "name": "alias_default_1570",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_268",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_1471",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_466",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_1472",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_467",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_1473",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1471",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_1154",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1473",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_538",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1472",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_469",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_539",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_538",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_1155",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_539",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_1156",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1155",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_540",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_540",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "sum_93",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "div_63",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_93",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_541",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1155",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_541",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "sub_47",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_47",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_469",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_542",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "mul_543",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_543",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "sum_94",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_542",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_1474",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_94",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "convert_element_type_1475",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1148",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1474",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "add_269",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1475",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "dtype_cast_431",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_431",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.ffn_norm",
+      "name": "alias_default_1574",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "alias_default_1157",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_464",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "einsum_default_443",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_465",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "permute_851",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_851",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "einsum_default_444",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_443",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "permute_852",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_852",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "dtype_cast_432",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_432",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wo",
+      "name": "alias_default_1569",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_444",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1172",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1172",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_853",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_853",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_460",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_461",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_462",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_463",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_150",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_15",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_333",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_334",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.sdpa",
+      "name": "getitem_335",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_335",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_854",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_334",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_855",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "permute_856",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_854",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1173",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1173",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "sum_95",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_95",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "squeeze_30",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_855",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1174",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1174",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "sum_96",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_96",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "squeeze_31",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_1480",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_856",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_1481",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1480",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1175",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1175",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_complex_94",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_459",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "_conj_30",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_30",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "clone_190",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_94",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_190",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "mul_544",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1481",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1176",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_complex_95",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_459",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "_conj_31",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_31",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "clone_191",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_95",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_191",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "mul_545",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_544",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_real_94",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_94",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1177",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1177",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_1482",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_545",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_as_real_95",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_95",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1178",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1178",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "convert_element_type_1483",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1179",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1482",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1180",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1483",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "view_1181",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1179",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_1158",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1158",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_455",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "einsum_default_445",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_458",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "permute_859",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1158",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_859",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "einsum_default_446",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_445",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "permute_860",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_860",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "dtype_cast_433",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_433",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wv",
+      "name": "alias_default_1568",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1180",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_1159",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1159",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_455",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "einsum_default_447",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_457",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "permute_863",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1159",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_863",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "einsum_default_448",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_446",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_448",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "add_270",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_447",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "permute_864",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_864",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "dtype_cast_434",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_434",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wk",
+      "name": "alias_default_1567",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1181",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention",
+      "name": "alias_default_1160",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1160",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_455",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "einsum_default_449",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_456",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "permute_867",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1160",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_867",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "einsum_default_450",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_450",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16",
+      "name": "add_271",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_449",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "permute_868",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_868",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "dtype_cast_435",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_435",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention.wq",
+      "name": "alias_default_1566",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_271",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_1496",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_451",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_1497",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_452",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_1498",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1496",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_1161",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1161",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1498",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_546",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1497",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_454",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_547",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_546",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_1162",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_547",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_1163",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1163",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1162",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_548",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_548",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "sum_97",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1163",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "div_64",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_97",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_549",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1162",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_549",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "sub_48",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_48",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_454",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_550",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1161",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1163",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "mul_551",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_551",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "sum_98",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_550",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_1499",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_98",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "convert_element_type_1500",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1499",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "add_272",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1500",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "dtype_cast_436",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_436",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.16.attention_norm",
+      "name": "alias_default_1573",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_272",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "alias_default_1164",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1164",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_449",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "einsum_default_451",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_450",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "permute_871",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1164",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_871",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "einsum_default_452",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_451",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "permute_872",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_872",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "dtype_cast_437",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_437",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "alias_default_1562",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_452",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w2",
+      "name": "alias_default_1165",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_446",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_552",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1165",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_448",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_553",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_552",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_1166",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1166",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_442",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "einsum_default_453",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_447",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "permute_875",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1166",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_875",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "einsum_default_454",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_453",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "permute_876",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_876",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "dtype_cast_438",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_438",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w3",
+      "name": "alias_default_1563",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_553",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "convert_element_type_1509",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_444",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "convert_element_type_1510",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1510",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_1167",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1167",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "neg_48",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "exp_48",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "add_273",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_273",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "reciprocal_16",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_16",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_554",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_554",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_1168",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1509",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_555",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "sub_49",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1167",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_556",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_556",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "add_274",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_555",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_274",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "mul_557",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_557",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "convert_element_type_1511",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1511",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward",
+      "name": "alias_default_1169",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1169",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_442",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "einsum_default_455",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_443",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "permute_879",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1169",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_879",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "einsum_default_456",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_454",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_456",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15",
+      "name": "add_275",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_455",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "permute_880",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_880",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "dtype_cast_439",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_439",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.feed_forward.w1",
+      "name": "alias_default_1561",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_275",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_1516",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_438",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_1517",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_439",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_1518",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1516",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_1170",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1170",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1518",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_558",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1517",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_441",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_559",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_558",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_1171",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_559",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_1172",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_560",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_560",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "sum_99",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "div_65",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_65",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_561",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_561",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "sub_50",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_50",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_441",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_562",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1170",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "mul_563",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_563",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "sum_100",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_562",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_1519",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_100",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "convert_element_type_1520",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1164",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1519",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "add_276",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1520",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "dtype_cast_440",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_440",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.ffn_norm",
+      "name": "alias_default_1565",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_276",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "alias_default_1173",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_436",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "einsum_default_457",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_437",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "permute_883",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_883",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "einsum_default_458",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_457",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "permute_884",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_884",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "dtype_cast_441",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_441",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wo",
+      "name": "alias_default_1560",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_458",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1196",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1196",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_885",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_885",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_432",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_433",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_434",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_435",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_136",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_141",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_142",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_16",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_336",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_337",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.sdpa",
+      "name": "getitem_338",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_338",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_886",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_337",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_887",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_336",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "permute_888",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_886",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1197",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "sum_101",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "squeeze_32",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_887",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1198",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1198",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "sum_102",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "squeeze_33",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_33",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_1525",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_888",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_1526",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1525",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1199",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1199",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_complex_96",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_431",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "_conj_32",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_32",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "clone_198",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_96",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_198",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "mul_564",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1526",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1200",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1200",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_complex_97",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_431",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "_conj_33",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_33",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "clone_199",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_97",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_199",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "mul_565",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_564",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_real_96",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_96",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1201",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1201",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_1527",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_565",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_as_real_97",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_97",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1202",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1202",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "convert_element_type_1528",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_32",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1203",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1527",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1204",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1528",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "view_1205",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1203",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_1174",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1174",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_427",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "einsum_default_459",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_430",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "permute_891",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1174",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_891",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "einsum_default_460",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_459",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "permute_892",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_892",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "dtype_cast_442",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_442",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wv",
+      "name": "alias_default_1559",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1204",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_1175",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1175",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_427",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "einsum_default_461",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_429",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "permute_895",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1175",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_895",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "einsum_default_462",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_460",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_462",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "add_277",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_461",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "permute_896",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_896",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "dtype_cast_443",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_443",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wk",
+      "name": "alias_default_1558",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1205",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention",
+      "name": "alias_default_1176",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_427",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "einsum_default_463",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_428",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "permute_899",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_899",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "einsum_default_464",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_277",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_464",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15",
+      "name": "add_278",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_463",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "permute_900",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_900",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "dtype_cast_444",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_444",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention.wq",
+      "name": "alias_default_1557",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_278",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_1541",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_423",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_1542",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_424",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_1543",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1541",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_1177",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1177",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1543",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_566",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1542",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_426",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_567",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_566",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_1178",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_567",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_1179",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1179",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1178",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_568",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_568",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "sum_103",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1179",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "div_66",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_66",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_103",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_569",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1178",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_569",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "sub_51",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_51",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_426",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_570",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1177",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1179",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "mul_571",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_571",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "sum_104",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_570",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_1544",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_104",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "convert_element_type_1545",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1544",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "add_279",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1545",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "dtype_cast_445",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_445",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.15.attention_norm",
+      "name": "alias_default_1564",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_279",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "alias_default_1180",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1180",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_421",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "einsum_default_465",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_422",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "permute_903",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1180",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_903",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "einsum_default_466",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_465",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "permute_904",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_904",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "dtype_cast_446",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_446",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "alias_default_1553",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_466",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w2",
+      "name": "alias_default_1181",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1181",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_418",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_572",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1181",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_420",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_573",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_572",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_1182",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1182",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_414",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "einsum_default_467",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_419",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "permute_907",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1182",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_907",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "einsum_default_468",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_467",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "permute_908",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_908",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "dtype_cast_447",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_447",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w3",
+      "name": "alias_default_1554",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_573",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "convert_element_type_1554",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "convert_element_type_1555",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1555",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_1183",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1183",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "neg_49",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "exp_49",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "add_280",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_280",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "reciprocal_17",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_17",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_574",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_574",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_1184",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1554",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1184",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_575",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1184",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "sub_52",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1183",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_576",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_576",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "add_281",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_575",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "mul_577",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_577",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "convert_element_type_1556",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1556",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward",
+      "name": "alias_default_1185",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1185",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_414",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "einsum_default_469",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_415",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "permute_911",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1185",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_911",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "einsum_default_470",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_468",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_470",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14",
+      "name": "add_282",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_469",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "permute_912",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_912",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "dtype_cast_448",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_448",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.feed_forward.w1",
+      "name": "alias_default_1552",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_282",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_1561",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_410",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_1562",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_411",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_1563",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1561",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_1186",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1563",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_578",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1562",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_413",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_579",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_578",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_1187",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_579",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_1188",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1188",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1187",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_580",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_580",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "sum_105",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1188",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "div_67",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_67",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_105",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_581",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1187",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_581",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "sub_53",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_53",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_413",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_582",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1188",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "mul_583",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_583",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "sum_106",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_582",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_1564",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_106",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "convert_element_type_1565",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1180",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1564",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "add_283",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1565",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "dtype_cast_449",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_449",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.ffn_norm",
+      "name": "alias_default_1556",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "alias_default_1189",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_408",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "einsum_default_471",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_409",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "permute_915",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_915",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "einsum_default_472",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_471",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "permute_916",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_916",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "dtype_cast_450",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_450",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wo",
+      "name": "alias_default_1551",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_472",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1220",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1220",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_917",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_917",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_404",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_405",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_406",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_407",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_132",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_133",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_17",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_339",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_340",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_17",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.sdpa",
+      "name": "getitem_341",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_341",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_918",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_340",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_919",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "permute_920",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_918",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1221",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1221",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "sum_107",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "squeeze_34",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_919",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1222",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1222",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "sum_108",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "squeeze_35",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_35",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_1570",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_920",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_1571",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1570",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1223",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1223",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_complex_98",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_403",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "_conj_34",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_34",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "clone_206",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_98",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_206",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "mul_584",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1571",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1224",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_complex_99",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_403",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "_conj_35",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_35",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "clone_207",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_99",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_207",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "mul_585",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_584",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_real_98",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_98",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1225",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1225",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_1572",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_585",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_as_real_99",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_99",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1226",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1226",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "convert_element_type_1573",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_34",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1227",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1572",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1228",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1573",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "view_1229",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1227",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_1190",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1190",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_399",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "einsum_default_473",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_402",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "permute_923",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1190",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_923",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "einsum_default_474",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_473",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "permute_924",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_924",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "dtype_cast_451",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_451",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wv",
+      "name": "alias_default_1550",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1228",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_1191",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1191",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_399",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "einsum_default_475",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_401",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "permute_927",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1191",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_927",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "einsum_default_476",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_474",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_476",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "add_284",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_475",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "permute_928",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_928",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "dtype_cast_452",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_452",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wk",
+      "name": "alias_default_1549",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1229",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention",
+      "name": "alias_default_1192",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1192",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_399",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "einsum_default_477",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_400",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "permute_931",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1192",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_931",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "einsum_default_478",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_284",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_478",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14",
+      "name": "add_285",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_477",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "permute_932",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_932",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "dtype_cast_453",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_453",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention.wq",
+      "name": "alias_default_1548",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_285",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_1586",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_395",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_1587",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_396",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_1588",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1586",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_1193",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1193",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1588",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_586",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1587",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_398",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_587",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_586",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_1194",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_587",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_1195",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1195",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1194",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_588",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_588",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "sum_109",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1195",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "div_68",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_68",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_589",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1194",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_589",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "sub_54",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_54",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_398",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_590",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1193",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1195",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "mul_591",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_591",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "sum_110",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_590",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_1589",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_110",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "convert_element_type_1590",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1589",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "add_286",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1590",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "dtype_cast_454",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_454",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.14.attention_norm",
+      "name": "alias_default_1555",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_286",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "alias_default_1196",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1196",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "einsum_default_479",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_394",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "permute_935",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1196",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_935",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "einsum_default_480",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_479",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "permute_936",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_936",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "dtype_cast_455",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_455",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "alias_default_1544",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_480",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w2",
+      "name": "alias_default_1197",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_592",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_593",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_592",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_1198",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1198",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_386",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "einsum_default_481",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_391",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "permute_939",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1198",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_939",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "einsum_default_482",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_481",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "permute_940",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_940",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "dtype_cast_456",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_456",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w3",
+      "name": "alias_default_1545",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_593",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "convert_element_type_1599",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_388",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "convert_element_type_1600",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1600",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_1199",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1199",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "neg_50",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "exp_50",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "add_287",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_287",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "reciprocal_18",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_18",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_594",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_594",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_1200",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1599",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1200",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_595",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1200",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "sub_55",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1199",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_596",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_596",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "add_288",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_595",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_288",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "mul_597",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_597",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "convert_element_type_1601",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1601",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward",
+      "name": "alias_default_1201",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1201",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_386",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "einsum_default_483",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_387",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "permute_943",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1201",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_943",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "einsum_default_484",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_482",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_484",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13",
+      "name": "add_289",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_483",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "permute_944",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_944",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "dtype_cast_457",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_457",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.feed_forward.w1",
+      "name": "alias_default_1543",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_289",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_1606",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_1607",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_383",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_1608",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1606",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_1202",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1608",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_598",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1607",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_385",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_599",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_598",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_1203",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_599",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_1204",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1204",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1203",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_600",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_600",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "sum_111",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1204",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "div_69",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_111",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_601",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1203",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_601",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "sub_56",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_56",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_385",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_602",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1204",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "mul_603",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_603",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "sum_112",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_602",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_1609",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_112",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "convert_element_type_1610",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1196",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1609",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "add_290",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1610",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "dtype_cast_458",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_458",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.ffn_norm",
+      "name": "alias_default_1547",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "alias_default_1205",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1205",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "einsum_default_485",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_381",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "permute_947",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1205",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_947",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "einsum_default_486",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_485",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "permute_948",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_948",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "dtype_cast_459",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_459",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wo",
+      "name": "alias_default_1542",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_486",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1244",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1244",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_949",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_949",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_376",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_377",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_379",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_118",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_123",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_124",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_18",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_342",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_343",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.sdpa",
+      "name": "getitem_344",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_344",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_950",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_343",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_951",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_342",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "permute_952",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_950",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1245",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1245",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "sum_113",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "squeeze_36",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_951",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1246",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1246",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "sum_114",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "squeeze_37",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_37",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_1615",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_952",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_1616",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1615",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1247",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1247",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_complex_100",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_375",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "_conj_36",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_36",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "clone_214",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_100",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_214",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "mul_604",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1616",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1248",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_complex_101",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_375",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "_conj_37",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_37",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "clone_215",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_215",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "mul_605",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_604",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_real_100",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_100",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1249",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1249",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_1617",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_605",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_as_real_101",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_101",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1250",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1250",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "convert_element_type_1618",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_36",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1251",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1617",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1252",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1618",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "view_1253",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1251",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_1206",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1206",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_371",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "einsum_default_487",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_374",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "permute_955",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1206",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_955",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "einsum_default_488",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_487",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "permute_956",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_956",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "dtype_cast_460",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_460",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wv",
+      "name": "alias_default_1541",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1252",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_1207",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1207",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_371",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "einsum_default_489",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_373",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "permute_959",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1207",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_959",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "einsum_default_490",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_488",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_490",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "add_291",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_489",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "permute_960",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_960",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "dtype_cast_461",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_461",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wk",
+      "name": "alias_default_1540",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1253",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention",
+      "name": "alias_default_1208",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1208",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_371",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "einsum_default_491",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_372",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "permute_963",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1208",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_963",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "einsum_default_492",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_492",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13",
+      "name": "add_292",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_491",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "permute_964",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_964",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "dtype_cast_462",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_462",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention.wq",
+      "name": "alias_default_1539",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_292",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_1631",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_367",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_1632",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_368",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_1633",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1631",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_1209",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1209",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1633",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_606",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1632",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_607",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_606",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_1210",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_607",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_1211",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1211",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1210",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_608",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_608",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "sum_115",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1211",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "div_70",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_609",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1210",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_609",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "sub_57",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_57",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_610",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1209",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1211",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "mul_611",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_611",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "sum_116",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_610",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_1634",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_116",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "convert_element_type_1635",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1205",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1634",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "add_293",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1635",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "dtype_cast_463",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_463",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.13.attention_norm",
+      "name": "alias_default_1546",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_293",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "alias_default_1212",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1212",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_365",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "einsum_default_493",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_366",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "permute_967",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1212",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_967",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "einsum_default_494",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_493",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "permute_968",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_968",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "dtype_cast_464",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_464",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "alias_default_1535",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_494",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w2",
+      "name": "alias_default_1213",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1213",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_362",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_612",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1213",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_364",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_613",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_612",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_1214",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1214",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_358",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "einsum_default_495",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_363",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "permute_971",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1214",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_971",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "einsum_default_496",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_495",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "permute_972",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_972",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "dtype_cast_465",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_465",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w3",
+      "name": "alias_default_1536",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_613",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "convert_element_type_1644",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_360",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "convert_element_type_1645",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1645",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_1215",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1215",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "neg_51",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "exp_51",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "add_294",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_294",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "reciprocal_19",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_19",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_614",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_614",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_1216",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1644",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1216",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_615",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1216",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "sub_58",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1215",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_616",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_616",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "add_295",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_615",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_295",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "mul_617",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_617",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "convert_element_type_1646",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1646",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward",
+      "name": "alias_default_1217",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1217",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_358",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "einsum_default_497",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_359",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "permute_975",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1217",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_975",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "einsum_default_498",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_496",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_498",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12",
+      "name": "add_296",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_497",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "permute_976",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_976",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "dtype_cast_466",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_466",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.feed_forward.w1",
+      "name": "alias_default_1534",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_296",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_1651",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_1652",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_355",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_1653",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1651",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_1218",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1218",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1653",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_618",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1652",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_357",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_619",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_618",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_1219",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_619",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_1220",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1220",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1219",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_620",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_620",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "sum_117",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1220",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "div_71",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_117",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_621",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1219",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_621",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "sub_59",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_357",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_622",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1218",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1220",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "mul_623",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_623",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "sum_118",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_622",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_1654",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_118",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "convert_element_type_1655",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1212",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1654",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "add_297",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1655",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "dtype_cast_467",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_467",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.ffn_norm",
+      "name": "alias_default_1538",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_297",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "alias_default_1221",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1221",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_352",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "einsum_default_499",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_353",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "permute_979",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1221",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_979",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "einsum_default_500",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_499",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "permute_980",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_980",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "dtype_cast_468",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_468",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wo",
+      "name": "alias_default_1533",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_500",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1268",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1268",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_981",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_981",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_350",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_351",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_114",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_115",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_19",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_345",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_346",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.sdpa",
+      "name": "getitem_347",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_347",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_982",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_983",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_345",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "permute_984",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_982",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1269",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1269",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "sum_119",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_119",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "squeeze_38",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_983",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1270",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1270",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "sum_120",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "squeeze_39",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_39",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_1660",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_984",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_1661",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1660",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1271",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1271",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_complex_102",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_347",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "_conj_38",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_38",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "clone_222",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_222",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "mul_624",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1661",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1272",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1272",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_complex_103",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_347",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "_conj_39",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_39",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "clone_223",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_223",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "mul_625",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_624",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_real_102",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_102",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1273",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1273",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_1662",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_625",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_as_real_103",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_103",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1274",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1274",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "convert_element_type_1663",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_38",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1275",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1662",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1276",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1663",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "view_1277",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1275",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_1222",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1222",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_343",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "einsum_default_501",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_346",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "permute_987",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1222",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_987",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "einsum_default_502",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_501",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "permute_988",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_988",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "dtype_cast_469",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_469",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wv",
+      "name": "alias_default_1532",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1276",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_1223",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1223",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_343",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "einsum_default_503",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_345",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "permute_991",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1223",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_991",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "einsum_default_504",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_502",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_504",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "add_298",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_503",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "permute_992",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_992",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "dtype_cast_470",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_470",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wk",
+      "name": "alias_default_1531",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1277",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention",
+      "name": "alias_default_1224",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_343",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "einsum_default_505",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_344",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "permute_995",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_995",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "einsum_default_506",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_506",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12",
+      "name": "add_299",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_505",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "permute_996",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_996",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "dtype_cast_471",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_471",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention.wq",
+      "name": "alias_default_1530",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_299",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_1676",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_1677",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_340",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_1678",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1676",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_1225",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1225",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1678",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_626",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1677",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_342",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_627",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_626",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_1226",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_627",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_1227",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1226",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_628",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_628",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "sum_121",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "div_72",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_121",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_629",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1226",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_629",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "sub_60",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_60",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_342",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_630",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1225",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "mul_631",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_631",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "sum_122",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_630",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_1679",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_122",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "convert_element_type_1680",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1221",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1679",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "add_300",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1680",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "dtype_cast_472",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_472",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.12.attention_norm",
+      "name": "alias_default_1537",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "alias_default_1228",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1228",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_337",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "einsum_default_507",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_338",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "permute_999",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1228",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_999",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "einsum_default_508",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_507",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "permute_1000",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1000",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "dtype_cast_473",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_473",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "alias_default_1526",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_508",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w2",
+      "name": "alias_default_1229",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1229",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_632",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1229",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_633",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_632",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_1230",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1230",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_330",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "einsum_default_509",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_335",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "permute_1003",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1230",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1003",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "einsum_default_510",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_509",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "permute_1004",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1004",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "dtype_cast_474",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_474",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w3",
+      "name": "alias_default_1527",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_633",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "convert_element_type_1689",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_332",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "convert_element_type_1690",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1690",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_1231",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1231",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "neg_52",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "exp_52",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "add_301",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_301",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "reciprocal_20",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_20",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_634",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_634",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_1232",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1689",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1232",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_635",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1232",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "sub_61",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1231",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_636",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_636",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "add_302",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_635",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_302",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "mul_637",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_637",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "convert_element_type_1691",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1691",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward",
+      "name": "alias_default_1233",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1233",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_330",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "einsum_default_511",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_331",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "permute_1007",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1233",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1007",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "einsum_default_512",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_510",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_512",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11",
+      "name": "add_303",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_511",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "permute_1008",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1008",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "dtype_cast_475",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_475",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.feed_forward.w1",
+      "name": "alias_default_1525",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_303",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_1696",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_1697",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_327",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_1698",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1696",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_1234",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1698",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_638",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1697",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_329",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_639",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_638",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_1235",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_639",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_1236",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1235",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_640",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_640",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "sum_123",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "div_73",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_73",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_123",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_641",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1235",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_641",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "sub_62",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_329",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_642",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1234",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "mul_643",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_643",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "sum_124",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_642",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_1699",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_124",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "convert_element_type_1700",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1228",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1699",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "add_304",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1700",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "dtype_cast_476",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_476",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.ffn_norm",
+      "name": "alias_default_1529",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_304",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "alias_default_1237",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_324",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "einsum_default_513",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_325",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "permute_1011",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1011",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "einsum_default_514",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_513",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "permute_1012",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1012",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "dtype_cast_477",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_477",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wo",
+      "name": "alias_default_1524",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_514",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1292",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1292",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_1013",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1013",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_320",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_100",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_105",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_106",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_20",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_348",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_349",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_20",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.sdpa",
+      "name": "getitem_350",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_350",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_1014",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_1015",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "permute_1016",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1014",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1293",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1293",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "sum_125",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_125",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "squeeze_40",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1015",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1294",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1294",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "sum_126",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "squeeze_41",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_41",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_1705",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1016",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_1706",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1705",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1295",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1295",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_complex_104",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_319",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "_conj_40",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_40",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "clone_230",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_230",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "mul_644",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1706",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1296",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1296",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_complex_105",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_319",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "_conj_41",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_41",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "clone_231",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_231",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "mul_645",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_644",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_real_104",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_104",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1297",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1297",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_1707",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_645",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_as_real_105",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_105",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1298",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1298",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "convert_element_type_1708",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_40",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1299",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1707",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1300",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1708",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "view_1301",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1299",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_1238",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1238",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_315",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "einsum_default_515",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_318",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "permute_1019",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1238",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1019",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "einsum_default_516",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_515",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "permute_1020",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1020",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "dtype_cast_478",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_478",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wv",
+      "name": "alias_default_1523",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1300",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_1239",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1239",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_315",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "einsum_default_517",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_317",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "permute_1023",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1239",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1023",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "einsum_default_518",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_516",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_518",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "add_305",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_517",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "permute_1024",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1024",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "dtype_cast_479",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_479",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wk",
+      "name": "alias_default_1522",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1301",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention",
+      "name": "alias_default_1240",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1240",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_315",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "einsum_default_519",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_316",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "permute_1027",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1240",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1027",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "einsum_default_520",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_305",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_520",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11",
+      "name": "add_306",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_519",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "permute_1028",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1028",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "dtype_cast_480",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_480",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention.wq",
+      "name": "alias_default_1521",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_306",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_1721",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_1722",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_312",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_1723",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1721",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_1241",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1723",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_646",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1722",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_647",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_646",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_1242",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_647",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_1243",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_648",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_648",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "sum_127",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "div_74",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_649",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_649",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "sub_63",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_63",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_650",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1241",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1243",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "mul_651",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_651",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "sum_128",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_650",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_1724",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_128",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "convert_element_type_1725",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1724",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "add_307",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1725",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "dtype_cast_481",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_481",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.11.attention_norm",
+      "name": "alias_default_1528",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_307",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "alias_default_1244",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1244",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_309",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "einsum_default_521",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_310",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "permute_1031",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1244",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1031",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "einsum_default_522",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_521",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "permute_1032",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1032",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "dtype_cast_482",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_482",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "alias_default_1517",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_522",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w2",
+      "name": "alias_default_1245",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1245",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_306",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_652",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1245",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_308",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_653",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_652",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_1246",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1246",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_302",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "einsum_default_523",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_307",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "permute_1035",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1246",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1035",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "einsum_default_524",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_523",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "permute_1036",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1036",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "dtype_cast_483",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_483",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w3",
+      "name": "alias_default_1518",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_653",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "convert_element_type_1734",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_304",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "convert_element_type_1735",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1735",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_1247",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1247",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "neg_53",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "exp_53",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "add_308",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_308",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "reciprocal_21",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_21",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_654",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_654",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_1248",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1734",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_655",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "sub_64",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1247",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_64",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_656",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_656",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "add_309",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_655",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_309",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "mul_657",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_657",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "convert_element_type_1736",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1736",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward",
+      "name": "alias_default_1249",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1249",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_302",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "einsum_default_525",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_303",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "permute_1039",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1249",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1039",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "einsum_default_526",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_524",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_526",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10",
+      "name": "add_310",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_525",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "permute_1040",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1040",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "dtype_cast_484",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_484",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.feed_forward.w1",
+      "name": "alias_default_1516",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_310",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_1741",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_1742",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_299",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_1743",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1741",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_1250",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1250",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1743",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_658",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1742",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_659",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_658",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_1251",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_659",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_1252",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1252",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1251",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_660",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_660",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "sum_129",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1252",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "div_75",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_75",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_129",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_661",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1251",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_661",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "sub_65",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_65",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_662",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1250",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1252",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "mul_663",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_663",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "sum_130",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_662",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_1744",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_130",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "convert_element_type_1745",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1244",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1744",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "add_311",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1745",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "dtype_cast_485",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_485",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.ffn_norm",
+      "name": "alias_default_1520",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_311",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "alias_default_1253",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1253",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_296",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "einsum_default_527",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_297",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "permute_1043",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1253",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1043",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "einsum_default_528",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_527",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "permute_1044",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1044",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "dtype_cast_486",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_486",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wo",
+      "name": "alias_default_1515",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_528",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1316",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1316",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_1045",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1045",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_292",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_293",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_294",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_295",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_96",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_97",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_21",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_351",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_352",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.sdpa",
+      "name": "getitem_353",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_353",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_1046",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_352",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_1047",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_351",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "permute_1048",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1046",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1317",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1317",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "sum_131",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_131",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "squeeze_42",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1047",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1318",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1318",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "sum_132",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_132",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "squeeze_43",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_43",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_1750",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1048",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_1751",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1750",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1319",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1319",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_complex_106",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_291",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "_conj_42",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_42",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "clone_238",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_238",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "mul_664",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1751",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1320",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1320",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_complex_107",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_291",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "_conj_43",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_43",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "clone_239",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_239",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "mul_665",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_664",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_real_106",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_106",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1321",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1321",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_1752",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_665",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_as_real_107",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_107",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1322",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1322",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "convert_element_type_1753",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_42",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1323",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1752",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1324",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1753",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "view_1325",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1323",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_1254",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1254",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_287",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "einsum_default_529",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_290",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "permute_1051",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1254",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1051",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "einsum_default_530",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_529",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "permute_1052",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1052",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "dtype_cast_487",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_487",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wv",
+      "name": "alias_default_1514",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1324",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_1255",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1255",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_287",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "einsum_default_531",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_289",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "permute_1055",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1255",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1055",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "einsum_default_532",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_530",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_532",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "add_312",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_531",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "permute_1056",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1056",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "dtype_cast_488",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_488",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wk",
+      "name": "alias_default_1513",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1325",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention",
+      "name": "alias_default_1256",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1256",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_287",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "einsum_default_533",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_288",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "permute_1059",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1256",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1059",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "einsum_default_534",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_312",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_534",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10",
+      "name": "add_313",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_533",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "permute_1060",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1060",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "dtype_cast_489",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_489",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention.wq",
+      "name": "alias_default_1512",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_313",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_1766",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_1767",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_284",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_1768",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1766",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_1257",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1257",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1768",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_666",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1767",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_286",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_667",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_666",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_1258",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_667",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_1259",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1259",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_668",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_668",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "sum_133",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1259",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "div_76",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_76",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_133",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_669",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_669",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "sub_66",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_66",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_286",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_670",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1257",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1259",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "mul_671",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_671",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "sum_134",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_670",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_1769",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_134",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "convert_element_type_1770",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1253",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1769",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "add_314",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1770",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "dtype_cast_490",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_490",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.10.attention_norm",
+      "name": "alias_default_1519",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "alias_default_1260",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1260",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "einsum_default_535",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_282",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "permute_1063",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1260",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1063",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "einsum_default_536",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_535",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "permute_1064",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1064",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "dtype_cast_491",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_491",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "alias_default_1508",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_536",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w2",
+      "name": "alias_default_1261",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1261",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_278",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_672",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1261",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_280",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_673",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_672",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_1262",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1262",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_274",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "einsum_default_537",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_279",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "permute_1067",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1262",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1067",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "einsum_default_538",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_537",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "permute_1068",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1068",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "dtype_cast_492",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_492",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w3",
+      "name": "alias_default_1509",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_673",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "convert_element_type_1779",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_276",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "convert_element_type_1780",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1780",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_1263",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1263",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "neg_54",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "exp_54",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "add_315",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_315",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "reciprocal_22",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_22",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_674",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_674",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_1264",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1779",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1264",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_675",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1264",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "sub_67",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1263",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_67",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_676",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_676",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "add_316",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_675",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_316",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "mul_677",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_677",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "convert_element_type_1781",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1781",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward",
+      "name": "alias_default_1265",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1265",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_274",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "einsum_default_539",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_275",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "permute_1071",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1265",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1071",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "einsum_default_540",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_538",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_540",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9",
+      "name": "add_317",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_539",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "permute_1072",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1072",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "dtype_cast_493",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_493",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.feed_forward.w1",
+      "name": "alias_default_1507",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_317",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_1786",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_270",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_1787",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_271",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_1788",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1786",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_1266",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1788",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_678",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1787",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_273",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_679",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_678",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_1267",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_679",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_1268",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1267",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_680",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_680",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "sum_135",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "div_77",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_135",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_681",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1267",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_681",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "sub_68",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_68",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_273",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_682",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "mul_683",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_683",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "sum_136",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_682",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_1789",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_136",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "convert_element_type_1790",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1260",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1789",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "add_318",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1790",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "dtype_cast_494",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_494",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.ffn_norm",
+      "name": "alias_default_1511",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_318",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "alias_default_1269",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_268",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "einsum_default_541",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_269",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "permute_1075",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1075",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "einsum_default_542",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_541",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "permute_1076",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1076",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "dtype_cast_495",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_495",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wo",
+      "name": "alias_default_1506",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_542",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1340",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1340",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_1077",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1077",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_264",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_265",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_266",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_267",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_82",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_87",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_88",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_22",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_354",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_355",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_22",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.sdpa",
+      "name": "getitem_356",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_356",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_1078",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_355",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_1079",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "permute_1080",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1078",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1341",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1341",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "sum_137",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_137",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "squeeze_44",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1079",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1342",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1342",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "sum_138",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "squeeze_45",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_45",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_1795",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1080",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_1796",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1795",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1343",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1343",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_complex_108",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_263",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "_conj_44",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_44",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "clone_246",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_246",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "mul_684",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1796",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1344",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1344",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_complex_109",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_263",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "_conj_45",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_45",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "clone_247",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_247",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "mul_685",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_684",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_real_108",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1345",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1345",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_1797",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_685",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_as_real_109",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_109",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1346",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1346",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "convert_element_type_1798",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_44",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1347",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1797",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1348",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1798",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "view_1349",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1347",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_1270",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1270",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_259",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "einsum_default_543",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_262",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "permute_1083",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1270",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1083",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "einsum_default_544",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_543",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "permute_1084",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1084",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "dtype_cast_496",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_496",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wv",
+      "name": "alias_default_1505",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1348",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_1271",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1271",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_259",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "einsum_default_545",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_261",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "permute_1087",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1271",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1087",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "einsum_default_546",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_544",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_546",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "add_319",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_545",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "permute_1088",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1088",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "dtype_cast_497",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_497",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wk",
+      "name": "alias_default_1504",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1349",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention",
+      "name": "alias_default_1272",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1272",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_259",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "einsum_default_547",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_260",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "permute_1091",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1272",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1091",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "einsum_default_548",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_319",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_548",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9",
+      "name": "add_320",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_547",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "permute_1092",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1092",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "dtype_cast_498",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_498",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention.wq",
+      "name": "alias_default_1503",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_320",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_1811",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_255",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_1812",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_256",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_1813",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1811",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_1273",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1273",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1813",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_686",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1812",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_687",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_686",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_1274",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_687",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_1275",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1275",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1274",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_688",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_688",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "sum_139",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1275",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "div_78",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_78",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_139",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_689",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1274",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_689",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "sub_69",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_258",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_690",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1273",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1275",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "mul_691",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_691",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "sum_140",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_690",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_1814",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_140",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "convert_element_type_1815",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1814",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "add_321",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1815",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "dtype_cast_499",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_499",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.9.attention_norm",
+      "name": "alias_default_1510",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "alias_default_1276",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1276",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_253",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "einsum_default_549",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_254",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "permute_1095",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1276",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1095",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "einsum_default_550",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_549",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "permute_1096",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1096",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "dtype_cast_500",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_500",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "alias_default_1499",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_550",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w2",
+      "name": "alias_default_1277",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1277",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_250",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_692",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1277",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_252",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_693",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_692",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_1278",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1278",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_246",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "einsum_default_551",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_251",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "permute_1099",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1278",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1099",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "einsum_default_552",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_551",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "permute_1100",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1100",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "dtype_cast_501",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_501",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w3",
+      "name": "alias_default_1500",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_693",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "convert_element_type_1824",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_248",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "convert_element_type_1825",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1825",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_1279",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1279",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "neg_55",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "exp_55",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "add_322",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_322",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "reciprocal_23",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_23",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_694",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_694",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_1280",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1824",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1280",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_695",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1280",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "sub_70",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1279",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_70",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_696",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_696",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "add_323",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_695",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_323",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "mul_697",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_697",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "convert_element_type_1826",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1826",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward",
+      "name": "alias_default_1281",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_246",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "einsum_default_553",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_247",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "permute_1103",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1281",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1103",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "einsum_default_554",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_552",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_554",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8",
+      "name": "add_324",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_553",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "permute_1104",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1104",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "dtype_cast_502",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_502",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.feed_forward.w1",
+      "name": "alias_default_1498",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_324",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_1831",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_242",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_1832",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_243",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_1833",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1831",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_1282",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1282",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1833",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_698",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1832",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_245",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_699",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_698",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_1283",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_699",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_1284",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1284",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_700",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_700",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "sum_141",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1284",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "div_79",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_79",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_701",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1283",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_701",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "sub_71",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_245",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_702",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1282",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1284",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "mul_703",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_703",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "sum_142",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_702",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_1834",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_142",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "convert_element_type_1835",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1276",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1834",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "add_325",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1835",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "dtype_cast_503",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_503",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.ffn_norm",
+      "name": "alias_default_1502",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_325",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "alias_default_1285",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1285",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_240",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "einsum_default_555",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_241",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "permute_1107",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1285",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1107",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "einsum_default_556",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_555",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "permute_1108",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1108",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "dtype_cast_504",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_504",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wo",
+      "name": "alias_default_1497",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_556",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1364",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1364",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_1109",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1109",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_236",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_238",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_239",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_73",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_78",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_79",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_23",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_357",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_358",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_23",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.sdpa",
+      "name": "getitem_359",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_359",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_1110",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_358",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_1111",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_357",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "permute_1112",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1365",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1365",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "sum_143",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_143",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "squeeze_46",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1366",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1366",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "sum_144",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_144",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "squeeze_47",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_47",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_1840",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_1841",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1840",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1367",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1367",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_complex_110",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_235",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "_conj_46",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_46",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "clone_254",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_254",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "mul_704",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1841",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1368",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1368",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_complex_111",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_235",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "_conj_47",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_47",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "clone_255",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_255",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "mul_705",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_704",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_real_110",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1369",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1369",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_1842",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_705",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_as_real_111",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1370",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1370",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "convert_element_type_1843",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_46",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1371",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1842",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1372",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1843",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "view_1373",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1371",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_1286",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1286",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_231",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "einsum_default_557",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_234",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "permute_1115",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1286",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1115",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "einsum_default_558",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_557",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "permute_1116",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1116",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "dtype_cast_505",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_505",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wv",
+      "name": "alias_default_1496",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1372",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_1287",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1287",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_231",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "einsum_default_559",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_233",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "permute_1119",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1287",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1119",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "einsum_default_560",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_558",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_560",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "add_326",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_559",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "permute_1120",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1120",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "dtype_cast_506",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_506",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wk",
+      "name": "alias_default_1495",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1373",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention",
+      "name": "alias_default_1288",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1288",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_231",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "einsum_default_561",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_232",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "permute_1123",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1288",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1123",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "einsum_default_562",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_326",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_562",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8",
+      "name": "add_327",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_561",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "permute_1124",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1124",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "dtype_cast_507",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_507",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention.wq",
+      "name": "alias_default_1494",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_327",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_1856",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_227",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_1857",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_228",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_1858",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1856",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_1289",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1289",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1858",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_706",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1857",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_230",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_707",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_706",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_1290",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_707",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_1291",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_708",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_708",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "sum_145",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "div_80",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_80",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_145",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_709",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1290",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_709",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "sub_72",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_230",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_710",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1289",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1291",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "mul_711",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_711",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "sum_146",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_710",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_1859",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_146",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "convert_element_type_1860",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1285",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1859",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "add_328",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1860",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "dtype_cast_508",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_508",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.8.attention_norm",
+      "name": "alias_default_1501",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_328",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "alias_default_1292",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1292",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_225",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "einsum_default_563",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_226",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "permute_1127",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1292",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1127",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "einsum_default_564",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_563",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "permute_1128",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1128",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "dtype_cast_509",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_509",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "alias_default_1490",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_564",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w2",
+      "name": "alias_default_1293",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1293",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_222",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_712",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1293",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_224",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_713",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_712",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_1294",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1294",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_218",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "einsum_default_565",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_223",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "permute_1131",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1294",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1131",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "einsum_default_566",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_565",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "permute_1132",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1132",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "dtype_cast_510",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_510",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w3",
+      "name": "alias_default_1491",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_713",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "convert_element_type_1869",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_220",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "convert_element_type_1870",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1870",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_1295",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1295",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "neg_56",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "exp_56",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "add_329",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_329",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "reciprocal_24",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_714",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_714",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_1296",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1869",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1296",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_715",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1296",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "sub_73",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1295",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_73",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_716",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_716",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "add_330",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_715",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_330",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "mul_717",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_717",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "convert_element_type_1871",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1871",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward",
+      "name": "alias_default_1297",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1297",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_218",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "einsum_default_567",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_219",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "permute_1135",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1297",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1135",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "einsum_default_568",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_566",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_568",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7",
+      "name": "add_331",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_567",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "permute_1136",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1136",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "dtype_cast_511",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_511",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.feed_forward.w1",
+      "name": "alias_default_1489",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_331",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_1876",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_214",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_1877",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_215",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_1878",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1876",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_1298",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1878",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_718",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1877",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_719",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_718",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_1299",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_719",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_1300",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1299",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_720",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_720",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "sum_147",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "div_81",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_721",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1299",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_721",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "sub_74",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_722",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1298",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1300",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "mul_723",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_723",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "sum_148",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_722",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_1879",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_148",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "convert_element_type_1880",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1292",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1879",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "add_332",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1880",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "dtype_cast_512",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_512",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.ffn_norm",
+      "name": "alias_default_1493",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_332",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "alias_default_1301",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_212",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "einsum_default_569",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_213",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "permute_1139",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1139",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "einsum_default_570",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_569",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "permute_1140",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1140",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "dtype_cast_513",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_513",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wo",
+      "name": "alias_default_1488",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_570",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1388",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1388",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_1141",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1141",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_208",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_209",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_210",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_211",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_64",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_69",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_70",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_24",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_360",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_361",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_24",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.sdpa",
+      "name": "getitem_362",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_1142",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_361",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_1143",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_360",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "permute_1144",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1142",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1389",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1389",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "sum_149",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_149",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "squeeze_48",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1143",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1390",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "sum_150",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_150",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "squeeze_49",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_49",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_1885",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1144",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_1886",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1885",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1391",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1391",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_complex_112",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_207",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "_conj_48",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_48",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "clone_262",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_262",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "mul_724",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1886",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1392",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_complex_113",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_207",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "_conj_49",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_49",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "clone_263",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_263",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "mul_725",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_724",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_real_112",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1393",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_1887",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_725",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_as_real_113",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1394",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1394",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "convert_element_type_1888",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_48",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1395",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1887",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1396",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1888",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "view_1397",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1395",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_1302",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1302",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_203",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "einsum_default_571",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_206",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "permute_1147",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1302",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1147",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "einsum_default_572",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_571",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "permute_1148",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1148",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "dtype_cast_514",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_514",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wv",
+      "name": "alias_default_1487",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1396",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_1303",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1303",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_203",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "einsum_default_573",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_205",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "permute_1151",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1303",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "einsum_default_574",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_572",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_574",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "add_333",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_573",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "permute_1152",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1152",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "dtype_cast_515",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_515",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wk",
+      "name": "alias_default_1486",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1397",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention",
+      "name": "alias_default_1304",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1304",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_203",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "einsum_default_575",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_204",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "permute_1155",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1304",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1155",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "einsum_default_576",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_576",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7",
+      "name": "add_334",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_575",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "permute_1156",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1156",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "dtype_cast_516",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_516",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention.wq",
+      "name": "alias_default_1485",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_334",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_1901",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_199",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_1902",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_200",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_1903",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1901",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_1305",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1305",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1903",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_726",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1902",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_727",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_726",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_1306",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_727",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_1307",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1307",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1306",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_728",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_728",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "sum_151",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1307",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "div_82",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_82",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_151",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_729",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1306",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_729",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "sub_75",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_75",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_202",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_730",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1305",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1307",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "mul_731",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_731",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "sum_152",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_730",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_1904",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_152",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "convert_element_type_1905",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1904",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "add_335",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1905",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "dtype_cast_517",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_517",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.7.attention_norm",
+      "name": "alias_default_1492",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_335",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "alias_default_1308",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1308",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_197",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "einsum_default_577",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_198",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "permute_1159",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1308",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1159",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "einsum_default_578",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_577",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "permute_1160",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1160",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "dtype_cast_518",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_518",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "alias_default_1481",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_578",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w2",
+      "name": "alias_default_1309",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1309",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_194",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_732",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1309",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_196",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_733",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_732",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_1310",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1310",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_190",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "einsum_default_579",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_195",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "permute_1163",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1310",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1163",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "einsum_default_580",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_579",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "permute_1164",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1164",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "dtype_cast_519",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_519",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w3",
+      "name": "alias_default_1482",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_733",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "convert_element_type_1914",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_192",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "convert_element_type_1915",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1915",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_1311",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1311",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "neg_57",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "exp_57",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "add_336",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "reciprocal_25",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_25",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_734",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_734",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_1312",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1914",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1312",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_735",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1312",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "sub_76",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1311",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_76",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_736",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_736",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "add_337",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_735",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_337",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "mul_737",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_737",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "convert_element_type_1916",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1916",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward",
+      "name": "alias_default_1313",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1313",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_190",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "einsum_default_581",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_191",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "permute_1167",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1313",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1167",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "einsum_default_582",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_580",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_582",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6",
+      "name": "add_338",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_581",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "permute_1168",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1168",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "dtype_cast_520",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_520",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.feed_forward.w1",
+      "name": "alias_default_1480",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_338",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_1921",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_186",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_1922",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_187",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_1923",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1921",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_1314",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1923",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_738",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1922",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_739",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_738",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_1315",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_739",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_1316",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1316",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1315",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_740",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_740",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "sum_153",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1316",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "div_83",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_83",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_153",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_741",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1315",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_741",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "sub_77",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_742",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1314",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1316",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "mul_743",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_743",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "sum_154",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_742",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_1924",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_154",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "convert_element_type_1925",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1308",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1924",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "add_339",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1925",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "dtype_cast_521",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_521",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.ffn_norm",
+      "name": "alias_default_1484",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "alias_default_1317",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1317",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_184",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "einsum_default_583",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_185",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "permute_1171",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1317",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1171",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "einsum_default_584",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_583",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "permute_1172",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1172",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "dtype_cast_522",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_522",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wo",
+      "name": "alias_default_1479",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_584",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1412",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1412",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_1173",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1173",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_180",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_182",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_183",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_55",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_60",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_61",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_25",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_363",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_364",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_25",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.sdpa",
+      "name": "getitem_365",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_365",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_1174",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_364",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_1175",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_363",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "permute_1176",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1174",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1413",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1413",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "sum_155",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_155",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "squeeze_50",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1175",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1414",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1414",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "sum_156",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_156",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "squeeze_51",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_51",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_1930",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1176",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_1931",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1930",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1415",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1415",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_complex_114",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_179",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "_conj_50",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_50",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "clone_270",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_270",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "mul_744",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1931",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1416",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_complex_115",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_179",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "_conj_51",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_51",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "clone_271",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_115",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_271",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "mul_745",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_744",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_real_114",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_114",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1417",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1417",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_1932",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_745",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_as_real_115",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_115",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1418",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1418",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "convert_element_type_1933",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_50",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1419",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1932",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1420",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1933",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "view_1421",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1419",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_1318",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1318",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_175",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "einsum_default_585",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_178",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "permute_1179",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1318",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1179",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "einsum_default_586",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_585",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "permute_1180",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1180",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "dtype_cast_523",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_523",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wv",
+      "name": "alias_default_1478",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1420",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_1319",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1319",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_175",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "einsum_default_587",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_177",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "permute_1183",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1319",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1183",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "einsum_default_588",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_586",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_588",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "add_340",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_587",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "permute_1184",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1184",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "dtype_cast_524",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_524",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wk",
+      "name": "alias_default_1477",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1421",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention",
+      "name": "alias_default_1320",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1320",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_175",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "einsum_default_589",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_176",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "permute_1187",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1320",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1187",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "einsum_default_590",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_340",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_590",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6",
+      "name": "add_341",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_589",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "permute_1188",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1188",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "dtype_cast_525",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_525",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention.wq",
+      "name": "alias_default_1476",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_341",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_1946",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_1947",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_172",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_1948",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1946",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_1321",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1948",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_746",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1947",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_174",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_747",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_746",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_1322",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_747",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_1323",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_748",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_748",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "sum_157",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "div_84",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_84",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_157",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_749",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1322",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_749",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "sub_78",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_78",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_174",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_750",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1321",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1323",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "mul_751",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_751",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "sum_158",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_750",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_1949",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_158",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "convert_element_type_1950",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1317",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1949",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "add_342",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1950",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "dtype_cast_526",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_526",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.6.attention_norm",
+      "name": "alias_default_1483",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_342",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "alias_default_1324",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1324",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_169",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "einsum_default_591",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_170",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "permute_1191",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1324",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1191",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "einsum_default_592",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_591",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "permute_1192",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1192",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "dtype_cast_527",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_527",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "alias_default_1472",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_592",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w2",
+      "name": "alias_default_1325",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1325",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_166",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_752",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1325",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_753",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_752",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_1326",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1326",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_162",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "einsum_default_593",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_167",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "permute_1195",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1326",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1195",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "einsum_default_594",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_593",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "permute_1196",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1196",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "dtype_cast_528",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_528",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w3",
+      "name": "alias_default_1473",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_753",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "convert_element_type_1959",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_164",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "convert_element_type_1960",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1960",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_1327",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1327",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "neg_58",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "exp_58",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "add_343",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_343",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "reciprocal_26",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_754",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_754",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_1328",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1959",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1328",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_755",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1328",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "sub_79",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1327",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_79",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_756",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_756",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "add_344",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_755",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_344",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "mul_757",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_757",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "convert_element_type_1961",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1961",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward",
+      "name": "alias_default_1329",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1329",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_162",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "einsum_default_595",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_163",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "permute_1199",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1329",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1199",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "einsum_default_596",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_594",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_596",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5",
+      "name": "add_345",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_595",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "permute_1200",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1200",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "dtype_cast_529",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_529",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.feed_forward.w1",
+      "name": "alias_default_1471",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_345",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_1966",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_158",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_1967",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_159",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_1968",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1966",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_1330",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1330",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1968",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_758",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1967",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_161",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_759",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_758",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_1331",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_759",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_1332",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1332",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1331",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_760",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_760",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "sum_159",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1332",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "div_85",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_85",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_159",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_761",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1331",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_761",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "sub_80",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_80",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_161",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_762",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1330",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1332",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "mul_763",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_763",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "sum_160",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_762",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_1969",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_160",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "convert_element_type_1970",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1324",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1969",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "add_346",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1970",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "dtype_cast_530",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_530",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.ffn_norm",
+      "name": "alias_default_1475",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "alias_default_1333",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_156",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "einsum_default_597",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_157",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "permute_1203",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1203",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "einsum_default_598",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_597",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "permute_1204",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1204",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "dtype_cast_531",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_531",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wo",
+      "name": "alias_default_1470",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_598",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1436",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1436",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_1205",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1205",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_152",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_153",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_154",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_155",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_51",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_52",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_26",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_366",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_367",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_26",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.sdpa",
+      "name": "getitem_368",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_368",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_1206",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_367",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_1207",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_366",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "permute_1208",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1206",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1437",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1437",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "sum_161",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_161",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "squeeze_52",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1207",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1438",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1438",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "sum_162",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_162",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "squeeze_53",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_53",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_1975",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1208",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_1976",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1975",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1439",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1439",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_complex_116",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "_conj_52",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_52",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "clone_278",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_116",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_278",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "mul_764",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1976",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1440",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1440",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_complex_117",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_151",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "_conj_53",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_53",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "clone_279",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_117",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_279",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "mul_765",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_764",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_real_116",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_116",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1441",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1441",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_1977",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_765",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_as_real_117",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_117",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1442",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1442",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "convert_element_type_1978",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1443",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1977",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1444",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_1978",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "view_1445",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1443",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_1334",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_147",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "einsum_default_599",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_150",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "permute_1211",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1211",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "einsum_default_600",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_599",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "permute_1212",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1212",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "dtype_cast_532",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_532",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wv",
+      "name": "alias_default_1469",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1444",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_1335",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1335",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_147",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "einsum_default_601",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_149",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "permute_1215",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1335",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1215",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "einsum_default_602",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_600",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_602",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "add_347",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_601",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "permute_1216",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1216",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "dtype_cast_533",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_533",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wk",
+      "name": "alias_default_1468",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1445",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention",
+      "name": "alias_default_1336",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_147",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "einsum_default_603",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_148",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "permute_1219",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1219",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "einsum_default_604",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_347",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_604",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5",
+      "name": "add_348",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_603",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "permute_1220",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1220",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "dtype_cast_534",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_534",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention.wq",
+      "name": "alias_default_1467",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_1991",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_143",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_1992",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_144",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_1993",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1991",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_1337",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1337",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_1993",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_766",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1992",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_767",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_766",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_1338",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_767",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_1339",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1338",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_768",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_768",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "sum_163",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "div_86",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_86",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_163",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_769",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1338",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_769",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "sub_81",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_81",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_770",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1337",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1339",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "mul_771",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_771",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "sum_164",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_770",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_1994",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_164",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "convert_element_type_1995",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_1994",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "add_349",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_1995",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "dtype_cast_535",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_535",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.5.attention_norm",
+      "name": "alias_default_1474",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "alias_default_1340",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1340",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_141",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "einsum_default_605",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_142",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "permute_1223",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1340",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1223",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "einsum_default_606",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_605",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "permute_1224",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1224",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "dtype_cast_536",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_536",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "alias_default_1463",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_606",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w2",
+      "name": "alias_default_1341",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1341",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_138",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_772",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1341",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_140",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_773",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_772",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_1342",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1342",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_134",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "einsum_default_607",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_139",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "permute_1227",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1342",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1227",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "einsum_default_608",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_607",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "permute_1228",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1228",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "dtype_cast_537",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_537",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w3",
+      "name": "alias_default_1464",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_773",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "convert_element_type_2004",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_136",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "convert_element_type_2005",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2005",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_1343",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1343",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "neg_59",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "exp_59",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "add_350",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_350",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "reciprocal_27",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_27",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_774",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_774",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_1344",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2004",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1344",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_775",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1344",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "sub_82",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1343",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_776",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_776",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "add_351",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_775",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_351",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "mul_777",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_777",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "convert_element_type_2006",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2006",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward",
+      "name": "alias_default_1345",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1345",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_134",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "einsum_default_609",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_135",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "permute_1231",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1345",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1231",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "einsum_default_610",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_608",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_610",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4",
+      "name": "add_352",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_609",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "permute_1232",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1232",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "dtype_cast_538",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_538",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.feed_forward.w1",
+      "name": "alias_default_1462",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_352",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_2011",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_130",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_2012",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_131",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_2013",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2011",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_1346",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2013",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_778",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2012",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_133",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_779",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_778",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_1347",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_779",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_1348",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1347",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_780",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_780",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "sum_165",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "div_87",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_165",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_781",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1347",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_781",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "sub_83",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_83",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_133",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_782",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1346",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1348",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "mul_783",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_783",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "sum_166",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_782",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_2014",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_166",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "convert_element_type_2015",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1340",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2014",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "add_353",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2015",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "dtype_cast_539",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_539",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.ffn_norm",
+      "name": "alias_default_1466",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_353",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "alias_default_1349",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_128",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "einsum_default_611",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_129",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "permute_1235",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1235",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "einsum_default_612",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_611",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "permute_1236",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1236",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "dtype_cast_540",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_540",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wo",
+      "name": "alias_default_1461",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_612",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1460",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1460",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_1237",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1237",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_124",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_125",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_37",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_42",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_43",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_27",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_369",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_370",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_27",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.sdpa",
+      "name": "getitem_371",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_371",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_1238",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_1239",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "permute_1240",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1238",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1461",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1461",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "sum_167",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_167",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "squeeze_54",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1239",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1462",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1462",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "sum_168",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_168",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "squeeze_55",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_55",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_2020",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1240",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_2021",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2020",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1463",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1463",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_complex_118",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_123",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "_conj_54",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_54",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "clone_286",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_118",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_286",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "mul_784",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2021",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1464",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1464",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_complex_119",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_123",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "_conj_55",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_55",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "clone_287",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_119",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_287",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "mul_785",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_784",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_real_118",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_118",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1465",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1465",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_2022",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_785",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_as_real_119",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_119",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1466",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1466",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "convert_element_type_2023",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1467",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2022",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1468",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2023",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "view_1469",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1467",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_1350",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1350",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_119",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "einsum_default_613",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_122",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "permute_1243",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1350",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1243",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "einsum_default_614",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_613",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "permute_1244",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1244",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "dtype_cast_541",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_541",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wv",
+      "name": "alias_default_1460",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1468",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_1351",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1351",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_119",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "einsum_default_615",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_121",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "permute_1247",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1351",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1247",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "einsum_default_616",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_614",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_616",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "add_354",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_615",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "permute_1248",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1248",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "dtype_cast_542",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_542",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wk",
+      "name": "alias_default_1459",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1469",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention",
+      "name": "alias_default_1352",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1352",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_119",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "einsum_default_617",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_120",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "permute_1251",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1352",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1251",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "einsum_default_618",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_618",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4",
+      "name": "add_355",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_617",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "permute_1252",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1252",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "dtype_cast_543",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_543",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention.wq",
+      "name": "alias_default_1458",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_355",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_2036",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_115",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_2037",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_116",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_2038",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2036",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_1353",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1353",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2038",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_786",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2037",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_118",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_787",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_786",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_1354",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_787",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_1355",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1355",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_788",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_788",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "sum_169",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1355",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "div_88",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_88",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_169",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_789",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1354",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_789",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "sub_84",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_84",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_118",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_790",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1353",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1355",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "mul_791",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_791",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "sum_170",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_790",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_2039",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_170",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "convert_element_type_2040",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1349",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2039",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "add_356",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2040",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "dtype_cast_544",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_544",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.4.attention_norm",
+      "name": "alias_default_1465",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_356",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "alias_default_1356",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1356",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "einsum_default_619",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_114",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "permute_1255",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1356",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1255",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "einsum_default_620",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_619",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "permute_1256",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1256",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "dtype_cast_545",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_545",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "alias_default_1454",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_620",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w2",
+      "name": "alias_default_1357",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1357",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_792",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1357",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_793",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_792",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_1358",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1358",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_106",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "einsum_default_621",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_111",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "permute_1259",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1358",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1259",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "einsum_default_622",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_621",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "permute_1260",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1260",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "dtype_cast_546",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_546",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w3",
+      "name": "alias_default_1455",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_793",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "convert_element_type_2049",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_108",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "convert_element_type_2050",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2050",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_1359",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1359",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "neg_60",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "exp_60",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "add_357",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_357",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "reciprocal_28",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_794",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_794",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_1360",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2049",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1360",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_795",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1360",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "sub_85",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1359",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_796",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_796",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "add_358",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_795",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_358",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "mul_797",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_797",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "convert_element_type_2051",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2051",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward",
+      "name": "alias_default_1361",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1361",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_106",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "einsum_default_623",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_107",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "permute_1263",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1361",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1263",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "einsum_default_624",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_622",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_624",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3",
+      "name": "add_359",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_623",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "permute_1264",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1264",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "dtype_cast_547",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_547",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.feed_forward.w1",
+      "name": "alias_default_1453",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_359",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_2056",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_2057",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_103",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_2058",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2056",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_1362",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2058",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_798",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2057",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_105",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_799",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_798",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_1363",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_799",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_1364",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1364",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1363",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_800",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_800",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "sum_171",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1364",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "div_89",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_89",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_801",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1363",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_801",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "sub_86",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_86",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_105",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_802",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1364",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "mul_803",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_803",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "sum_172",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_802",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_2059",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_172",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "convert_element_type_2060",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1356",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2059",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "add_360",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2060",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "dtype_cast_548",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_548",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.ffn_norm",
+      "name": "alias_default_1457",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_360",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "alias_default_1365",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1365",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_100",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "einsum_default_625",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_101",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "permute_1267",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1365",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1267",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "einsum_default_626",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_625",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "permute_1268",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1268",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "dtype_cast_549",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_549",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wo",
+      "name": "alias_default_1452",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_626",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1484",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1484",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_1269",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1269",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_97",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_98",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_99",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_33",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_34",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_28",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_372",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_373",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_28",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.sdpa",
+      "name": "getitem_374",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_374",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_1270",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_373",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_1271",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_372",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "permute_1272",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1270",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1485",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1485",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "sum_173",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_173",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "squeeze_56",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1271",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1486",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1486",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "sum_174",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_174",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "squeeze_57",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_2065",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1272",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_2066",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2065",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1487",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1487",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_complex_120",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_95",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "_conj_56",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_56",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "clone_294",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_294",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "mul_804",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2066",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1488",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1488",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_complex_121",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_95",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "_conj_57",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_57",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "clone_295",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_295",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "mul_805",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_804",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_real_120",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_120",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1489",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1489",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_2067",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_805",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_as_real_121",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_121",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1490",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1490",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "convert_element_type_2068",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1491",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2067",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1492",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2068",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "view_1493",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1491",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_1366",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1366",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_91",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "einsum_default_627",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_94",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "permute_1275",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1366",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1275",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "einsum_default_628",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_627",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "permute_1276",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1276",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "dtype_cast_550",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_550",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wv",
+      "name": "alias_default_1451",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1492",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_1367",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1367",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_91",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "einsum_default_629",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_93",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "permute_1279",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1367",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1279",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "einsum_default_630",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_628",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_630",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "add_361",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_629",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "permute_1280",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1280",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "dtype_cast_551",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_551",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wk",
+      "name": "alias_default_1450",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1493",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention",
+      "name": "alias_default_1368",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1368",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_91",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "einsum_default_631",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_92",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "permute_1283",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1368",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1283",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "einsum_default_632",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_361",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_632",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3",
+      "name": "add_362",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_631",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "permute_1284",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1284",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "dtype_cast_552",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_552",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention.wq",
+      "name": "alias_default_1449",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_362",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_2081",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_2082",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_88",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_2083",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2081",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_1369",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2083",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_806",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2082",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_807",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_806",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_1370",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_807",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_1371",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1371",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_808",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_808",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "sum_175",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1371",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "div_90",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_175",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_809",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_809",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "sub_87",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_87",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_810",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1371",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "mul_811",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_811",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "sum_176",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_810",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_2084",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_176",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "convert_element_type_2085",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1365",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2084",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "add_363",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2085",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "dtype_cast_553",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_553",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.3.attention_norm",
+      "name": "alias_default_1456",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_363",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "alias_default_1372",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1372",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_85",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "einsum_default_633",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "cluster_root": "permute_1319",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_86",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "permute_1287",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "cluster_root": "einsum_default_648",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1372",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1287",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "einsum_default_634",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_633",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "permute_1288",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1288",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "dtype_cast_554",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_554",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "alias_default_1445",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "cluster_root": "alias_default_1389",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_634",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w2",
+      "name": "alias_default_1373",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "cluster_root": "mul_832",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1373",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_82",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_812",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "cluster_root": "mul_833",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1373",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_84",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_813",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "cluster_root": "alias_default_1390",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_812",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_1374",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1374",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_78",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "einsum_default_635",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "cluster_root": "permute_1323",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_83",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "permute_1291",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "cluster_root": "einsum_default_650",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1374",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1291",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "einsum_default_636",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_635",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "permute_1292",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1292",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "dtype_cast_555",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_555",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w3",
+      "name": "alias_default_1446",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "cluster_root": "convert_element_type_2139",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_813",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "convert_element_type_2094",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "cluster_root": "convert_element_type_2140",
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_80",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "convert_element_type_2095",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "cluster_root": "alias_default_1391",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2095",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_1375",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "cluster_root": "neg_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1375",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "neg_61",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "cluster_root": "exp_62",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "exp_61",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "cluster_root": "add_371",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "add_364",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "cluster_root": "reciprocal_30",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_364",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "reciprocal_29",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "cluster_root": "mul_834",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_814",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "cluster_root": "alias_default_1392",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_814",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_1376",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "cluster_root": "mul_835",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2094",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1376",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_815",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "cluster_root": "sub_91",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1376",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "sub_88",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "cluster_root": "mul_836",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1375",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_88",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_816",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "cluster_root": "add_372",
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_816",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "add_365",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "cluster_root": "mul_837",
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_815",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_365",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "mul_817",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "cluster_root": "convert_element_type_2141",
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_817",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "convert_element_type_2096",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "cluster_root": "alias_default_1393",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2096",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward",
+      "name": "alias_default_1377",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1377",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_78",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "einsum_default_637",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "cluster_root": "permute_1327",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_79",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "permute_1295",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "cluster_root": "einsum_default_652",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1377",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1295",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "einsum_default_638",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_636",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_638",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2",
+      "name": "add_366",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_637",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "permute_1296",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1296",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "dtype_cast_556",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_556",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.feed_forward.w1",
+      "name": "alias_default_1444",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "cluster_root": "convert_element_type_2146",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_366",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_2101",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "cluster_root": "convert_element_type_2147",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_74",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_2102",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "cluster_root": "convert_element_type_2148",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_75",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_2103",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "cluster_root": "alias_default_1394",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2101",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_1378",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "cluster_root": "mul_838",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2103",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_818",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "cluster_root": "mul_839",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2102",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_819",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "cluster_root": "alias_default_1395",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_818",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_1379",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "cluster_root": "alias_default_1396",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_819",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_1380",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "cluster_root": "mul_840",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1379",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_820",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "cluster_root": "sum_183",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_820",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "sum_177",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "cluster_root": "div_93",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "div_91",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "cluster_root": "mul_841",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_91",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_177",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_821",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "cluster_root": "sub_92",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1379",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_821",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "sub_89",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "cluster_root": "mul_842",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_89",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_77",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_822",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "mul_823",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_823",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "sum_178",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "cluster_root": "convert_element_type_2149",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_822",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_2104",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_178",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "convert_element_type_2105",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1372",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2104",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "add_367",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2105",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "dtype_cast_557",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_557",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.ffn_norm",
+      "name": "alias_default_1448",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_367",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "alias_default_1381",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_72",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "einsum_default_639",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "cluster_root": "permute_1331",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_73",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "permute_1299",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "cluster_root": "einsum_default_654",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1299",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "einsum_default_640",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_639",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "permute_1300",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1300",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "dtype_cast_558",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_558",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wo",
+      "name": "alias_default_1443",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "cluster_root": "view_1532",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_640",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1508",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "cluster_root": "permute_1333",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1508",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_1301",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1301",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_68",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_69",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_70",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_71",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_19",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_24",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_25",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_29",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "cluster_root": "getitem_378",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_375",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "cluster_root": "getitem_379",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_376",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "cluster_root": "getitem_380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_29",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.sdpa",
+      "name": "getitem_377",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "cluster_root": "permute_1334",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_377",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_1302",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "cluster_root": "permute_1335",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_376",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_1303",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "cluster_root": "permute_1336",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_375",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "permute_1304",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "cluster_root": "view_1533",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1302",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1509",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "cluster_root": "sum_185",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1509",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "sum_179",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "cluster_root": "squeeze_60",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_179",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "squeeze_58",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "cluster_root": "view_1534",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1303",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1510",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "cluster_root": "sum_186",
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1510",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "sum_180",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "cluster_root": "squeeze_61",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_180",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "squeeze_59",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "cluster_root": "convert_element_type_2155",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_59",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_2110",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "cluster_root": "convert_element_type_2156",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1304",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_2111",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "cluster_root": "view_1535",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2110",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1511",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "cluster_root": "view_as_complex_124",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1511",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_complex_122",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "cluster_root": "_conj_60",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_67",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "_conj_58",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "cluster_root": "clone_310",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_58",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "clone_302",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "cluster_root": "mul_844",
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_122",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_302",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "mul_824",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "cluster_root": "view_1536",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2111",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1512",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "cluster_root": "view_as_complex_125",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1512",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_complex_123",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "cluster_root": "_conj_61",
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_67",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "_conj_59",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "cluster_root": "clone_311",
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_59",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "clone_303",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "cluster_root": "mul_845",
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_123",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_303",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "mul_825",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "cluster_root": "view_as_real_124",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_824",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_real_122",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "cluster_root": "view_1537",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_122",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1513",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "cluster_root": "convert_element_type_2157",
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1513",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_2112",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "cluster_root": "view_as_real_125",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_825",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_as_real_123",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "cluster_root": "view_1538",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_123",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1514",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "cluster_root": "convert_element_type_2158",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1514",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "convert_element_type_2113",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "cluster_root": "view_1539",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_58",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1515",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "cluster_root": "view_1540",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2112",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1516",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "cluster_root": "view_1541",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2113",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "view_1517",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "cluster_root": "alias_default_1398",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1515",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_1382",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1382",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_63",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "einsum_default_641",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "cluster_root": "permute_1339",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_66",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "permute_1307",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "cluster_root": "einsum_default_656",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1382",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1307",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "einsum_default_642",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_641",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "permute_1308",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1308",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "dtype_cast_559",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_559",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wv",
+      "name": "alias_default_1442",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "cluster_root": "alias_default_1399",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1516",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_1383",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1383",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_63",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "einsum_default_643",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "cluster_root": "permute_1343",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_65",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "permute_1311",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "cluster_root": "einsum_default_658",
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1383",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1311",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "einsum_default_644",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_642",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_644",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "add_368",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_643",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "permute_1312",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1312",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "dtype_cast_560",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_560",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wk",
+      "name": "alias_default_1441",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "cluster_root": "alias_default_1400",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1517",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention",
+      "name": "alias_default_1384",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1384",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_63",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "einsum_default_645",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "cluster_root": "permute_1347",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_64",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "permute_1315",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "cluster_root": "einsum_default_660",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1384",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1315",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "einsum_default_646",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_368",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_646",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2",
+      "name": "add_369",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_645",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "permute_1316",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1316",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "dtype_cast_561",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_561",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention.wq",
+      "name": "alias_default_1440",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "cluster_root": "convert_element_type_2171",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_369",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_2126",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "cluster_root": "convert_element_type_2172",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_59",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_2127",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "cluster_root": "convert_element_type_2173",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_60",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_2128",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "cluster_root": "alias_default_1401",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2126",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_1385",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "cluster_root": "mul_846",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1385",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2128",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_826",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "cluster_root": "mul_847",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2127",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_827",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "cluster_root": "alias_default_1402",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_826",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_1386",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "cluster_root": "alias_default_1403",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_827",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_1387",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "cluster_root": "mul_848",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1387",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1386",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_828",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "cluster_root": "sum_187",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_828",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "sum_181",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "cluster_root": "div_94",
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1387",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "div_92",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "cluster_root": "mul_849",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_92",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_181",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_829",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "cluster_root": "sub_93",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1386",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_829",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "sub_90",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "cluster_root": "mul_850",
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_90",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_62",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_830",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1385",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1387",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "mul_831",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_831",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "sum_182",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "cluster_root": "convert_element_type_2174",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_830",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_2129",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_182",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "convert_element_type_2130",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2129",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "add_370",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2130",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "dtype_cast_562",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_562",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.2.attention_norm",
+      "name": "alias_default_1447",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_370",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "alias_default_1388",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "cluster_root": "einsum_default_661",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1388",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_57",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "einsum_default_647",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 113,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_58",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "permute_1319",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 114,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1388",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1319",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "einsum_default_648",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "cluster_root": "permute_1352",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_647",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "permute_1320",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "cluster_root": "dtype_cast_572",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1320",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "dtype_cast_563",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "cluster_root": "alias_default_1427",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_563",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "alias_default_1436",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 115,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_648",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w2",
+      "name": "alias_default_1389",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 116,
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1389",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_54",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_832",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 117,
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1389",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_56",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_833",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 118,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_832",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_1390",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "cluster_root": "einsum_default_663",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_50",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "einsum_default_649",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 119,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_55",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "permute_1323",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 120,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1390",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1323",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "einsum_default_650",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "cluster_root": "permute_1356",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_649",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "permute_1324",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "cluster_root": "dtype_cast_573",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1324",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "dtype_cast_564",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "cluster_root": "alias_default_1428",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_564",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w3",
+      "name": "alias_default_1437",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 121,
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_833",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "convert_element_type_2139",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 122,
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_52",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "convert_element_type_2140",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 123,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2140",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_1391",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 124,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1391",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "neg_62",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 125,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "exp_62",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 126,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "add_371",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 127,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_371",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "reciprocal_30",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 128,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_30",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_834",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 129,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_834",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_1392",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 130,
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2139",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_835",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 131,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1392",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "sub_91",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 132,
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1391",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_91",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_836",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 133,
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_836",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "add_372",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 134,
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_835",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_372",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "mul_837",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 135,
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_837",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "convert_element_type_2141",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 136,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2141",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward",
+      "name": "alias_default_1393",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "cluster_root": "einsum_default_665",
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_50",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "einsum_default_651",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 137,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_51",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "permute_1327",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 138,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1393",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1327",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "einsum_default_652",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 139,
+      "cluster_root": "add_163",
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_650",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_652",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1",
+      "name": "add_373",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "cluster_root": "permute_1360",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_651",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "permute_1328",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "cluster_root": "dtype_cast_574",
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1328",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "dtype_cast_565",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "cluster_root": "alias_default_1426",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_565",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.feed_forward.w1",
+      "name": "alias_default_1435",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 140,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_373",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_2146",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 141,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_46",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_2147",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 142,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_47",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_2148",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 143,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2146",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_1394",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 144,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1394",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2148",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_838",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 145,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2147",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_839",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 146,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_838",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_1395",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 147,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_839",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_1396",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 148,
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1396",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1395",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_840",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 149,
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_840",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "sum_183",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 150,
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1396",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "div_93",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 151,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_93",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_183",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_841",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 152,
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1395",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_841",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "sub_92",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 153,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_92",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_49",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_842",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "cluster_root": "mul_863",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1394",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1396",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "mul_843",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "cluster_root": "sum_190",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_843",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "sum_184",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 154,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_842",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_2149",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "cluster_root": "convert_element_type_2195",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_184",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "convert_element_type_2150",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 155,
+      "cluster_root": "add_164",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1388",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2149",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "add_374",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "cluster_root": "dtype_cast_575",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2150",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "dtype_cast_566",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "cluster_root": "alias_default_1430",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_566",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.ffn_norm",
+      "name": "alias_default_1439",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 156,
+      "cluster_root": "alias_default_917",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_374",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "alias_default_1397",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "cluster_root": "einsum_default_667",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1397",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_44",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "einsum_default_653",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 157,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_45",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "permute_1331",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 158,
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1397",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1331",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "einsum_default_654",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "cluster_root": "permute_1364",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_653",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "permute_1332",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "cluster_root": "dtype_cast_576",
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1332",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "dtype_cast_567",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "cluster_root": "alias_default_1425",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_567",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wo",
+      "name": "alias_default_1434",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 159,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_654",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1532",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 160,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1532",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_1333",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 161,
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1333",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_40",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_41",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_42",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_43",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_10",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_15",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_16",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_30",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 162,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_378",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 163,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_379",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 164,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_30",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.sdpa",
+      "name": "getitem_380",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 165,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_380",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_1334",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 166,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_379",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_1335",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 167,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_378",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "permute_1336",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 168,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1334",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1533",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 169,
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1533",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "sum_185",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 170,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_185",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "squeeze_60",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 171,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1335",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1534",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 172,
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1534",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "sum_186",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 173,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_186",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "squeeze_61",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 174,
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_61",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_2155",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 175,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1336",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_2156",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 176,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2155",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1535",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 177,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1535",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_complex_124",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 178,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_39",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "_conj_60",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 179,
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_60",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "clone_310",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 180,
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_124",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_310",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "mul_844",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 181,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2156",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1536",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 182,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1536",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_complex_125",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 183,
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_39",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "_conj_61",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 184,
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_61",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "clone_311",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 185,
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_125",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_311",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "mul_845",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 186,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_844",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_real_124",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 187,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_124",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1537",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 188,
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1537",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_2157",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 189,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_845",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_as_real_125",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 190,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_125",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1538",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 191,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1538",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "convert_element_type_2158",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 192,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_60",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1539",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 193,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2157",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1540",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 194,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2158",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "view_1541",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 195,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1539",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_1398",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "cluster_root": "einsum_default_669",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1398",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_35",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "einsum_default_655",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 196,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_38",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "permute_1339",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 197,
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1398",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1339",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "einsum_default_656",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "cluster_root": "permute_1372",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_655",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "permute_1340",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "cluster_root": "dtype_cast_577",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1340",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "dtype_cast_568",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "cluster_root": "alias_default_1424",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_568",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wv",
+      "name": "alias_default_1433",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 198,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1540",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_1399",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "cluster_root": "einsum_default_671",
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1399",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_35",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "einsum_default_657",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 199,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_37",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "permute_1343",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 200,
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1399",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1343",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "einsum_default_658",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 201,
+      "cluster_root": "add_165",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_656",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_658",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "add_375",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "cluster_root": "permute_1376",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_657",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "permute_1344",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "cluster_root": "dtype_cast_578",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1344",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "dtype_cast_569",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "cluster_root": "alias_default_1423",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_569",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wk",
+      "name": "alias_default_1432",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 202,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1541",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention",
+      "name": "alias_default_1400",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "cluster_root": "einsum_default_673",
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1400",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_35",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "einsum_default_659",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 203,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_36",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "permute_1347",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 204,
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1400",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1347",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "einsum_default_660",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 205,
+      "cluster_root": "add_166",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_375",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_660",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1",
+      "name": "add_376",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "cluster_root": "permute_1380",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_659",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "permute_1348",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "cluster_root": "dtype_cast_579",
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1348",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "dtype_cast_570",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "cluster_root": "alias_default_1422",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_570",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention.wq",
+      "name": "alias_default_1431",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 206,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_376",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_2171",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 207,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_2172",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 208,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_32",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_2173",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 209,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2171",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_1401",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 210,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1401",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2173",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_846",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 211,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2172",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_847",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 212,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_846",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_1402",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 213,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_847",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_1403",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 214,
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1403",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1402",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_848",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 215,
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_848",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "sum_187",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 216,
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1403",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "div_94",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 217,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_94",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_187",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_849",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 218,
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1402",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_849",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "sub_93",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 219,
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_93",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_34",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_850",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "cluster_root": "mul_871",
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1401",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1403",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "mul_851",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "cluster_root": "sum_194",
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_851",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "sum_188",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 220,
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_850",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_2174",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "cluster_root": "convert_element_type_2220",
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_188",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "convert_element_type_2175",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 221,
+      "cluster_root": "add_167",
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1397",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2174",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "add_377",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "cluster_root": "dtype_cast_580",
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2175",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "dtype_cast_571",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "cluster_root": "alias_default_1429",
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_571",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.1.attention_norm",
+      "name": "alias_default_1438",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 222,
+      "cluster_root": "alias_default_924",
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)R",
+          "name": "add_377",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "alias_default_1404",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)R",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 109,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1404",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_29",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "einsum_default_661",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "alias_default_30",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "permute_1351",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_1404",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "permute_1351",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "einsum_default_662",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 110,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "einsum_default_661",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "permute_1352",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 111,
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "permute_1352",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "dtype_cast_572",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 112,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(1)",
+          "name": "dtype_cast_572",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "alias_default_1427",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_662",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w2",
+      "name": "alias_default_1405",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1405",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_26",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_852",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1405",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_28",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_853",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_852",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_1406",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 223,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1406",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_22",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "einsum_default_663",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_27",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "permute_1355",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1406",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1355",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "einsum_default_664",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 224,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_663",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "permute_1356",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 225,
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1356",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "dtype_cast_573",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 226,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_573",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w3",
+      "name": "alias_default_1428",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_853",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "convert_element_type_2184",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 136.64587220149252,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_24",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "convert_element_type_2185",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2185",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_1407",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1407",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "neg_63",
+      "op": "aten.neg.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "neg_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "exp_63",
+      "op": "aten.exp.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "exp_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "add_378",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_378",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "reciprocal_31",
+      "op": "aten.reciprocal.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "reciprocal_31",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_854",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_854",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_1408",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2184",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1408",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_855",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1408",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "sub_94",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1407",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sub_94",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_856",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 182.1944962686567,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_856",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "add_379",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 273.29174440298505,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_855",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "add_379",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "mul_857",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 136.64587220149252,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_857",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "convert_element_type_2186",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2186",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward",
+      "name": "alias_default_1409",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 227,
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1409",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_22",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "einsum_default_665",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        14336
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_23",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "permute_1359",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 694.8379851971689,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1409",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(0)",
+          "name": "permute_1359",
+          "src_placement": "RS(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "einsum_default_666",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 156.16671108742005,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_664",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)P(sum)",
+          "name": "einsum_default_666",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0",
+      "name": "add_380",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)P(sum)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 228,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_665",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "permute_1360",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 355
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 229,
+      "compute_cost": 34.16146805037313,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1360",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "dtype_cast_574",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 230,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 487.952,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_574",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.feed_forward.w1",
+      "name": "alias_default_1426",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 430.3685785129651,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_380",
+          "src_placement": "S(0)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_2191",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_18",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_2192",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_19",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_2193",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2191",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_1410",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1410",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2193",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_858",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2192",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_859",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_858",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_1411",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_859",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_1412",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1412",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1411",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_860",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_860",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "sum_189",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1412",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "div_95",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_95",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_189",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_861",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1411",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_861",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "sub_95",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_95",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_21",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_862",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 231,
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1410",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1412",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "mul_863",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 232,
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_863",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "sum_190",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_862",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_2194",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 233,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_190",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "convert_element_type_2195",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1404",
+          "src_placement": "S(0)R",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2194",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "add_381",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "cluster_id": 234,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2195",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "dtype_cast_575",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 235,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_575",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.ffn_norm",
+      "name": "alias_default_1430",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "alias_default_1413",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 236,
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1413",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_16",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "einsum_default_667",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_17",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "permute_1363",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1413",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1363",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "einsum_default_668",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 237,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "einsum_default_667",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "permute_1364",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return self.wo(output)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 316
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 238,
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "permute_1364",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "dtype_cast_576",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 239,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 331.9007188940092,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_576",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wo",
+      "name": "alias_default_1425",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(2)",
+          "name": "einsum_default_668",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1556",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "output = output.view(bs, seqlen, -1)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 315
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1556",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_1365",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "output = output.transpose(",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 312
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 1985.2513862776257,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "permute_1365",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_12",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_13",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_14",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_15",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_1",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_6",
+          "src_placement": "RR",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "getitem_7",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "_scaled_dot_product_flash_attention_backward_31",
+      "op": "aten._scaled_dot_product_flash_attention_backward.default",
+      "phase": "backward",
+      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem_381",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem_382",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "_scaled_dot_product_flash_attention_backward_31",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.sdpa",
+      "name": "getitem_383",
+      "op": "<built-in function getitem>",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        32,
+        8192,
+        128
+      ],
+      "source": {
+        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 53
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_383",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_1366",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 308
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_1367",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 307
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "getitem_381",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "permute_1368",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 306
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1366",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1557",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1557",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "sum_191",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_191",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "squeeze_62",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1367",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1558",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        4,
+        128
+      ],
+      "source": {
+        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 223
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 16.26736573827292,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1558",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "sum_192",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        1,
+        128
+      ],
+      "source": {
+        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 222
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "sum_192",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "squeeze_63",
+      "op": "aten.squeeze.dim",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "torch.unsqueeze(x, dim=3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "repeat_kv",
+        "line": 221
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_63",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_2200",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "permute_1368",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_2201",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 212
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2200",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1559",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1559",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_complex_126",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "_conj_62",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_62",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "clone_318",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 14.64062916444563,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_318",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "mul_864",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64
+      ],
+      "source": {
+        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 211
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2201",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1560",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1560",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_complex_127",
+      "op": "aten.view_as_complex.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_11",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "_conj_63",
+      "op": "aten._conj.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "_conj_63",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "clone_319",
+      "op": "aten.clone.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1,
+        8192,
+        1,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 53.68230693630064,
+      "dtype": "complex64",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_complex_127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "clone_319",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "mul_865",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64
+      ],
+      "source": {
+        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 210
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_864",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_real_126",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_126",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1561",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 9.760419442963753,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1561",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_2202",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        8,
+        128
+      ],
+      "source": {
+        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 208
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "mul_865",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_as_real_127",
+      "op": "aten.view_as_real.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        64,
+        2
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_as_real_127",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1562",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1562",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "convert_element_type_2203",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        32,
+        128
+      ],
+      "source": {
+        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "apply_rotary_emb",
+        "line": 207
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "squeeze_62",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1563",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2202",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1564",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "convert_element_type_2203",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "view_1565",
+      "op": "aten.view.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1563",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_1414",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 297
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 240,
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1414",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_7",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "einsum_default_669",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_10",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "permute_1371",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1414",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 36.328589861751155,
+          "dst_placement": "RR",
+          "name": "permute_1371",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "einsum_default_670",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "cluster_id": 241,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_669",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "permute_1372",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 242,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1372",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "dtype_cast_577",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 243,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_577",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wv",
+      "name": "alias_default_1424",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1564",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_1415",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        1024
+      ],
+      "source": {
+        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 296
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 244,
+      "compute_cost": 56.12241179704158,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1415",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_7",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "einsum_default_671",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        1024
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_9",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "permute_1375",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 49.631284656940636,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 86.07528421052632,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1415",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "permute_1375",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "einsum_default_672",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_670",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_672",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "add_382",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 245,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_671",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "permute_1376",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 246,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1376",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "dtype_cast_578",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 247,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 57.40529711375213,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_578",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wk",
+      "name": "alias_default_1423",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "view_1565",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention",
+      "name": "alias_default_1416",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(2)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 295
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 248,
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(2)",
+          "name": "alias_default_1416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)R",
+          "name": "alias_default_7",
+          "src_placement": "S(0)R",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "einsum_default_673",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "P(sum)S(1)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RS(1)",
+          "name": "alias_default_8",
+          "src_placement": "RS(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "permute_1379",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "RS(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 198.52513862776254,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 190.35670720457864,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1416",
+          "src_placement": "S(0)S(2)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 94.3143594470046,
+          "dst_placement": "RR",
+          "name": "permute_1379",
+          "src_placement": "RS(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "einsum_default_674",
+      "op": "aten.einsum.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 2.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_382",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "einsum_default_674",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0",
+      "name": "add_383",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 249,
+      "compute_cost": 0.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(1)",
+          "name": "einsum_default_673",
+          "src_placement": "P(sum)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "permute_1380",
+      "op": "aten.permute.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 290
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 250,
+      "compute_cost": 9.760419442963753,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)S(0)",
+          "name": "permute_1380",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "dtype_cast_579",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 251,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 160.272,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_579",
+          "src_placement": "P(sum)S(0)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention.wq",
+      "name": "alias_default_1422",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "add_383",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type_2216",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_3",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type_2217",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_4",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type_2218",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "RR",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2216",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_1417",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1417",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "convert_element_type_2218",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_866",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2217",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_867",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_866",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_1418",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_867",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_1419",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1419",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1418",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_868",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_868",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "sum_193",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        1
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.05557036247335,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1419",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "div_96",
+      "op": "aten.div.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "div_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sum_193",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_869",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1418",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_869",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "sub_96",
+      "op": "aten.sub.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 52.06192480221486,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "sub_96",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_6",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_870",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 252,
+      "compute_cost": 78.08335554371003,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1417",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1419",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "mul_871",
+      "op": "aten.mul.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 253,
+      "compute_cost": 26.034139620978188,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_871",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "sum_194",
+      "op": "aten.sum.dim_IntList",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "mul_870",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type_2219",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 254,
+      "compute_cost": 7.0,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "sum_194",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "convert_element_type_2220",
+      "op": "prims.convert_element_type.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 39.041677771855014,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "alias_default_1413",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(0)S(1)",
+          "name": "convert_element_type_2219",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "name": "add_384",
+      "op": "aten.add.Tensor",
+      "phase": "backward",
+      "placement": "S(0)S(1)",
+      "shape": [
+        8,
+        8192,
+        4096
+      ],
+      "source": {
+        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
+        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
+        "func": "rms_norm",
+        "line": 2964
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 255,
+      "compute_cost": 7.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "P(sum)P(sum)",
+          "name": "convert_element_type_2220",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "dtype_cast_580",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "P(sum)P(sum)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "cluster_id": 256,
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 28.41652038284296,
+          "dst_placement": "S(0)S(0)",
+          "name": "dtype_cast_580",
+          "src_placement": "P(sum)P(sum)",
+          "transition_cost": 1
+        }
+      ],
+      "module_path": "L['self'].layers.0.attention_norm",
+      "name": "alias_default_1429",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 38.685829146330285,
+      "dtype": "bfloat16",
+      "inputs": [
+        {
+          "comm_cost": 706.2108351658422,
+          "dst_placement": "S(2)S(2)",
+          "name": "add_384",
+          "src_placement": "S(0)S(1)",
+          "transition_cost": 1
+        },
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "RR",
+          "name": "alias_default_1",
+          "src_placement": "RR",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "embedding_dense_backward",
+      "op": "aten.embedding_dense_backward.default",
+      "phase": "backward",
+      "placement": "S(1)S(1)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
+        "func": "forward",
+        "line": 539
+      },
+      "transition_cost": 1.0
+    },
+    {
+      "compute_cost": 76.40578345195063,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(1)S(1)",
+          "name": "embedding_dense_backward",
+          "src_placement": "S(1)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "dtype_cast_581",
+      "op": "autoparallel.dtype_cast.default",
+      "phase": "backward",
+      "placement": "S(1)S(1)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "compute_cost": 0.0,
+      "dtype": "float32",
+      "inputs": [
+        {
+          "comm_cost": 0.0,
+          "dst_placement": "S(1)S(1)",
+          "name": "dtype_cast_581",
+          "src_placement": "S(1)S(1)",
+          "transition_cost": 0
+        }
+      ],
+      "module_path": "L['self'].tok_embeddings",
+      "name": "alias_default_1421",
+      "op": "aten.alias.default",
+      "phase": "backward",
+      "placement": "S(1)S(1)",
+      "shape": [
+        128256,
+        4096
+      ],
+      "source": {
+        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
+        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
+        "func": "getter",
+        "line": 25
+      },
+      "transition_cost": 0.0
+    },
+    {
+      "inputs": [
+        {
+          "name": "alias_default_1420"
+        },
+        {
+          "name": "alias_default_1421"
+        },
+        {
+          "name": "alias_default_1422"
+        },
+        {
+          "name": "alias_default_1423"
+        },
+        {
+          "name": "alias_default_1424"
+        },
+        {
+          "name": "alias_default_1425"
+        },
+        {
+          "name": "alias_default_1426"
+        },
+        {
+          "name": "alias_default_1427"
+        },
+        {
+          "name": "alias_default_1428"
+        },
+        {
+          "name": "alias_default_1429"
+        },
+        {
+          "name": "alias_default_1430"
+        },
+        {
+          "name": "alias_default_1431"
+        },
+        {
+          "name": "alias_default_1432"
+        },
+        {
+          "name": "alias_default_1433"
+        },
+        {
+          "name": "alias_default_1434"
+        },
+        {
+          "name": "alias_default_1435"
+        },
+        {
+          "name": "alias_default_1436"
+        },
+        {
+          "name": "alias_default_1437"
+        },
+        {
+          "name": "alias_default_1438"
+        },
+        {
+          "name": "alias_default_1439"
+        },
+        {
+          "name": "alias_default_1440"
+        },
+        {
+          "name": "alias_default_1441"
+        },
+        {
+          "name": "alias_default_1442"
+        },
+        {
+          "name": "alias_default_1443"
+        },
+        {
+          "name": "alias_default_1444"
+        },
+        {
+          "name": "alias_default_1445"
+        },
+        {
+          "name": "alias_default_1446"
+        },
+        {
+          "name": "alias_default_1447"
+        },
+        {
+          "name": "alias_default_1448"
+        },
+        {
+          "name": "alias_default_1449"
+        },
+        {
+          "name": "alias_default_1450"
+        },
+        {
+          "name": "alias_default_1451"
+        },
+        {
+          "name": "alias_default_1452"
+        },
+        {
+          "name": "alias_default_1453"
+        },
+        {
+          "name": "alias_default_1454"
+        },
+        {
+          "name": "alias_default_1455"
+        },
+        {
+          "name": "alias_default_1456"
+        },
+        {
+          "name": "alias_default_1457"
+        },
+        {
+          "name": "alias_default_1458"
+        },
+        {
+          "name": "alias_default_1459"
+        },
+        {
+          "name": "alias_default_1460"
+        },
+        {
+          "name": "alias_default_1461"
+        },
+        {
+          "name": "alias_default_1462"
+        },
+        {
+          "name": "alias_default_1463"
+        },
+        {
+          "name": "alias_default_1464"
+        },
+        {
+          "name": "alias_default_1465"
+        },
+        {
+          "name": "alias_default_1466"
+        },
+        {
+          "name": "alias_default_1467"
+        },
+        {
+          "name": "alias_default_1468"
+        },
+        {
+          "name": "alias_default_1469"
+        },
+        {
+          "name": "alias_default_1470"
+        },
+        {
+          "name": "alias_default_1471"
+        },
+        {
+          "name": "alias_default_1472"
+        },
+        {
+          "name": "alias_default_1473"
+        },
+        {
+          "name": "alias_default_1474"
+        },
+        {
+          "name": "alias_default_1475"
+        },
+        {
+          "name": "alias_default_1476"
+        },
+        {
+          "name": "alias_default_1477"
+        },
+        {
+          "name": "alias_default_1478"
+        },
+        {
+          "name": "alias_default_1479"
+        },
+        {
+          "name": "alias_default_1480"
+        },
+        {
+          "name": "alias_default_1481"
+        },
+        {
+          "name": "alias_default_1482"
+        },
+        {
+          "name": "alias_default_1483"
+        },
+        {
+          "name": "alias_default_1484"
+        },
+        {
+          "name": "alias_default_1485"
+        },
+        {
+          "name": "alias_default_1486"
+        },
+        {
+          "name": "alias_default_1487"
+        },
+        {
+          "name": "alias_default_1488"
+        },
+        {
+          "name": "alias_default_1489"
+        },
+        {
+          "name": "alias_default_1490"
+        },
+        {
+          "name": "alias_default_1491"
+        },
+        {
+          "name": "alias_default_1492"
+        },
+        {
+          "name": "alias_default_1493"
+        },
+        {
+          "name": "alias_default_1494"
+        },
+        {
+          "name": "alias_default_1495"
+        },
+        {
+          "name": "alias_default_1496"
+        },
+        {
+          "name": "alias_default_1497"
+        },
+        {
+          "name": "alias_default_1498"
+        },
+        {
+          "name": "alias_default_1499"
+        },
+        {
+          "name": "alias_default_1500"
+        },
+        {
+          "name": "alias_default_1501"
+        },
+        {
+          "name": "alias_default_1502"
+        },
+        {
+          "name": "alias_default_1503"
+        },
+        {
+          "name": "alias_default_1504"
+        },
+        {
+          "name": "alias_default_1505"
+        },
+        {
+          "name": "alias_default_1506"
+        },
+        {
+          "name": "alias_default_1507"
+        },
+        {
+          "name": "alias_default_1508"
+        },
+        {
+          "name": "alias_default_1509"
+        },
+        {
+          "name": "alias_default_1510"
+        },
+        {
+          "name": "alias_default_1511"
+        },
+        {
+          "name": "alias_default_1512"
+        },
+        {
+          "name": "alias_default_1513"
+        },
+        {
+          "name": "alias_default_1514"
+        },
+        {
+          "name": "alias_default_1515"
+        },
+        {
+          "name": "alias_default_1516"
+        },
+        {
+          "name": "alias_default_1517"
+        },
+        {
+          "name": "alias_default_1518"
+        },
+        {
+          "name": "alias_default_1519"
+        },
+        {
+          "name": "alias_default_1520"
+        },
+        {
+          "name": "alias_default_1521"
+        },
+        {
+          "name": "alias_default_1522"
+        },
+        {
+          "name": "alias_default_1523"
+        },
+        {
+          "name": "alias_default_1524"
+        },
+        {
+          "name": "alias_default_1525"
+        },
+        {
+          "name": "alias_default_1526"
+        },
+        {
+          "name": "alias_default_1527"
+        },
+        {
+          "name": "alias_default_1528"
+        },
+        {
+          "name": "alias_default_1529"
+        },
+        {
+          "name": "alias_default_1530"
+        },
+        {
+          "name": "alias_default_1531"
+        },
+        {
+          "name": "alias_default_1532"
+        },
+        {
+          "name": "alias_default_1533"
+        },
+        {
+          "name": "alias_default_1534"
+        },
+        {
+          "name": "alias_default_1535"
+        },
+        {
+          "name": "alias_default_1536"
+        },
+        {
+          "name": "alias_default_1537"
+        },
+        {
+          "name": "alias_default_1538"
+        },
+        {
+          "name": "alias_default_1539"
+        },
+        {
+          "name": "alias_default_1540"
+        },
+        {
+          "name": "alias_default_1541"
+        },
+        {
+          "name": "alias_default_1542"
+        },
+        {
+          "name": "alias_default_1543"
+        },
+        {
+          "name": "alias_default_1544"
+        },
+        {
+          "name": "alias_default_1545"
+        },
+        {
+          "name": "alias_default_1546"
+        },
+        {
+          "name": "alias_default_1547"
+        },
+        {
+          "name": "alias_default_1548"
+        },
+        {
+          "name": "alias_default_1549"
+        },
+        {
+          "name": "alias_default_1550"
+        },
+        {
+          "name": "alias_default_1551"
+        },
+        {
+          "name": "alias_default_1552"
+        },
+        {
+          "name": "alias_default_1553"
+        },
+        {
+          "name": "alias_default_1554"
+        },
+        {
+          "name": "alias_default_1555"
+        },
+        {
+          "name": "alias_default_1556"
+        },
+        {
+          "name": "alias_default_1557"
+        },
+        {
+          "name": "alias_default_1558"
+        },
+        {
+          "name": "alias_default_1559"
+        },
+        {
+          "name": "alias_default_1560"
+        },
+        {
+          "name": "alias_default_1561"
+        },
+        {
+          "name": "alias_default_1562"
+        },
+        {
+          "name": "alias_default_1563"
+        },
+        {
+          "name": "alias_default_1564"
+        },
+        {
+          "name": "alias_default_1565"
+        },
+        {
+          "name": "alias_default_1566"
+        },
+        {
+          "name": "alias_default_1567"
+        },
+        {
+          "name": "alias_default_1568"
+        },
+        {
+          "name": "alias_default_1569"
+        },
+        {
+          "name": "alias_default_1570"
+        },
+        {
+          "name": "alias_default_1571"
+        },
+        {
+          "name": "alias_default_1572"
+        },
+        {
+          "name": "alias_default_1573"
+        },
+        {
+          "name": "alias_default_1574"
+        },
+        {
+          "name": "alias_default_1575"
+        },
+        {
+          "name": "alias_default_1576"
+        },
+        {
+          "name": "alias_default_1577"
+        },
+        {
+          "name": "alias_default_1578"
+        },
+        {
+          "name": "alias_default_1579"
+        },
+        {
+          "name": "alias_default_1580"
+        },
+        {
+          "name": "alias_default_1581"
+        },
+        {
+          "name": "alias_default_1582"
+        },
+        {
+          "name": "alias_default_1583"
+        },
+        {
+          "name": "alias_default_1584"
+        },
+        {
+          "name": "alias_default_1585"
+        },
+        {
+          "name": "alias_default_1586"
+        },
+        {
+          "name": "alias_default_1587"
+        },
+        {
+          "name": "alias_default_1588"
+        },
+        {
+          "name": "alias_default_1589"
+        },
+        {
+          "name": "alias_default_1590"
+        },
+        {
+          "name": "alias_default_1591"
+        },
+        {
+          "name": "alias_default_1592"
+        },
+        {
+          "name": "alias_default_1593"
+        },
+        {
+          "name": "alias_default_1594"
+        },
+        {
+          "name": "alias_default_1595"
+        },
+        {
+          "name": "alias_default_1596"
+        },
+        {
+          "name": "alias_default_1597"
+        },
+        {
+          "name": "alias_default_1598"
+        },
+        {
+          "name": "alias_default_1599"
+        },
+        {
+          "name": "alias_default_1600"
+        },
+        {
+          "name": "alias_default_1601"
+        },
+        {
+          "name": "alias_default_1602"
+        },
+        {
+          "name": "alias_default_1603"
+        },
+        {
+          "name": "alias_default_1604"
+        },
+        {
+          "name": "alias_default_1605"
+        },
+        {
+          "name": "alias_default_1606"
+        },
+        {
+          "name": "alias_default_1607"
+        },
+        {
+          "name": "alias_default_1608"
+        },
+        {
+          "name": "alias_default_1609"
+        },
+        {
+          "name": "alias_default_1610"
+        },
+        {
+          "name": "alias_default_1611"
+        },
+        {
+          "name": "alias_default_1612"
+        },
+        {
+          "name": "alias_default_1613"
+        },
+        {
+          "name": "alias_default_1614"
+        },
+        {
+          "name": "alias_default_1615"
+        },
+        {
+          "name": "alias_default_1616"
+        },
+        {
+          "name": "alias_default_1617"
+        },
+        {
+          "name": "alias_default_1618"
+        },
+        {
+          "name": "alias_default_1619"
+        },
+        {
+          "name": "alias_default_1620"
+        },
+        {
+          "name": "alias_default_1621"
+        },
+        {
+          "name": "alias_default_1622"
+        },
+        {
+          "name": "alias_default_1623"
+        },
+        {
+          "name": "alias_default_1624"
+        },
+        {
+          "name": "alias_default_1625"
+        },
+        {
+          "name": "alias_default_1626"
+        },
+        {
+          "name": "alias_default_1627"
+        },
+        {
+          "name": "alias_default_1628"
+        },
+        {
+          "name": "alias_default_1629"
+        },
+        {
+          "name": "alias_default_1630"
+        },
+        {
+          "name": "alias_default_1631"
+        },
+        {
+          "name": "alias_default_1632"
+        },
+        {
+          "name": "alias_default_1633"
+        },
+        {
+          "name": "alias_default_1634"
+        },
+        {
+          "name": "alias_default_1635"
+        },
+        {
+          "name": "alias_default_1636"
+        },
+        {
+          "name": "alias_default_1637"
+        },
+        {
+          "name": "alias_default_1638"
+        },
+        {
+          "name": "alias_default_1639"
+        },
+        {
+          "name": "alias_default_1640"
+        },
+        {
+          "name": "alias_default_1641"
+        },
+        {
+          "name": "alias_default_1642"
+        },
+        {
+          "name": "alias_default_1643"
+        },
+        {
+          "name": "alias_default_1644"
+        },
+        {
+          "name": "alias_default_1645"
+        },
+        {
+          "name": "alias_default_1646"
+        },
+        {
+          "name": "alias_default_1647"
+        },
+        {
+          "name": "alias_default_1648"
+        },
+        {
+          "name": "alias_default_1649"
+        },
+        {
+          "name": "alias_default_1650"
+        },
+        {
+          "name": "alias_default_1651"
+        },
+        {
+          "name": "alias_default_1652"
+        },
+        {
+          "name": "alias_default_1653"
+        },
+        {
+          "name": "alias_default_1654"
+        },
+        {
+          "name": "alias_default_1655"
+        },
+        {
+          "name": "alias_default_1656"
+        },
+        {
+          "name": "alias_default_1657"
+        },
+        {
+          "name": "alias_default_1658"
+        },
+        {
+          "name": "alias_default_1659"
+        },
+        {
+          "name": "alias_default_1660"
+        },
+        {
+          "name": "alias_default_1661"
+        },
+        {
+          "name": "alias_default_1662"
+        },
+        {
+          "name": "alias_default_1663"
+        },
+        {
+          "name": "alias_default_1664"
+        },
+        {
+          "name": "alias_default_1665"
+        },
+        {
+          "name": "alias_default_1666"
+        },
+        {
+          "name": "alias_default_1667"
+        },
+        {
+          "name": "alias_default_1668"
+        },
+        {
+          "name": "alias_default_1669"
+        },
+        {
+          "name": "alias_default_1670"
+        },
+        {
+          "name": "alias_default_1671"
+        },
+        {
+          "name": "alias_default_1672"
+        },
+        {
+          "name": "alias_default_1673"
+        },
+        {
+          "name": "alias_default_1674"
+        },
+        {
+          "name": "alias_default_1675"
+        },
+        {
+          "name": "alias_default_1676"
+        },
+        {
+          "name": "alias_default_1677"
+        },
+        {
+          "name": "alias_default_1678"
+        },
+        {
+          "name": "alias_default_1679"
+        },
+        {
+          "name": "alias_default_1680"
+        },
+        {
+          "name": "alias_default_1681"
+        },
+        {
+          "name": "alias_default_1682"
+        },
+        {
+          "name": "alias_default_1683"
+        },
+        {
+          "name": "alias_default_1684"
+        },
+        {
+          "name": "alias_default_1685"
+        },
+        {
+          "name": "alias_default_1686"
+        },
+        {
+          "name": "alias_default_1687"
+        },
+        {
+          "name": "alias_default_1688"
+        },
+        {
+          "name": "alias_default_1689"
+        },
+        {
+          "name": "alias_default_1690"
+        },
+        {
+          "name": "alias_default_1691"
+        },
+        {
+          "name": "alias_default_1692"
+        },
+        {
+          "name": "alias_default_1693"
+        },
+        {
+          "name": "alias_default_1694"
+        },
+        {
+          "name": "alias_default_1695"
+        },
+        {
+          "name": "alias_default_1696"
+        },
+        {
+          "name": "alias_default_1697"
+        },
+        {
+          "name": "alias_default_1698"
+        },
+        {
+          "name": "alias_default_1699"
+        },
+        {
+          "name": "alias_default_1700"
+        },
+        {
+          "name": "alias_default_1701"
+        },
+        {
+          "name": "alias_default_1702"
+        },
+        {
+          "name": "alias_default_1703"
+        },
+        {
+          "name": "alias_default_1704"
+        },
+        {
+          "name": "alias_default_1705"
+        },
+        {
+          "name": "alias_default_1706"
+        },
+        {
+          "name": "alias_default_1707"
+        },
+        {
+          "name": "alias_default_1708"
+        },
+        {
+          "name": "alias_default_1709"
+        },
+        {
+          "name": "alias_default_1710"
+        },
+        {
+          "name": "alias_default_1711"
+        }
+      ],
+      "name": "output",
+      "op": "output"
+    }
+  ],
+  "summary": {
+    "comm": 212780.17498325979,
+    "compute": 581120.8234224034,
+    "total": 794933.9984056632,
+    "transition": 1033.0
+  }
+}
\ No newline at end of file
diff --git a/profile_results/llama3_8b_4x4_strategy_summary.json b/profile_results/llama3_8b_4x4_strategy_summary.json
new file mode 100644
index 00000000..ccdeb4d9
--- /dev/null
+++ b/profile_results/llama3_8b_4x4_strategy_summary.json
@@ -0,0 +1,2054 @@
+{
+  "config": {
+    "batch_size": 8,
+    "input_constraint": "Shard(0), Replicate()",
+    "mesh_dim_names": [
+      "dp",
+      "tp"
+    ],
+    "mesh_shape": [
+      4,
+      4
+    ],
+    "model": "autoparallel._testing.models.llama3 Transformer 8B config",
+    "output_constraint": "Shard(0), Shard(2)",
+    "seqlen": 8192,
+    "vocab_size": 128256,
+    "world_size": 16
+  },
+  "elapsed_s": 115.23945621983148,
+  "json_summary": {
+    "comm": 212780.17498325979,
+    "compute": 581120.8234224034,
+    "total": 794933.9984056632,
+    "transition": 1033.0
+  },
+  "optimizer_profile": {
+    "ilp": {
+      "cluster_copied_decision_variables": 8181840,
+      "constraints": 175408,
+      "logical_decision_variables": 8657526,
+      "unique_variables": 475686
+    },
+    "last_solve": {
+      "constraints": 175412,
+      "extract_s": 0.044429945992305875,
+      "kind": "solve",
+      "objective": 794933.998405679,
+      "objective_s": 3.8023465629667044,
+      "pipeline_total_s": 102.16174313612282,
+      "solve_s": 59.80278266593814,
+      "status": "Optimal",
+      "total_s": 63.73084603413008,
+      "unique_variables": 475686
+    },
+    "mesh": {
+      "dim_names": [
+        "dp",
+        "tp"
+      ],
+      "ndim": 2,
+      "shape": [
+        4,
+        4
+      ],
+      "size": 16
+    },
+    "model": {
+      "graph_nodes": 8668,
+      "op_counts": {
+        "call_function": 8373,
+        "output": 1,
+        "placeholder": 294
+      },
+      "parameter_bytes": 32121044992,
+      "parameter_nodes": 291,
+      "parameter_numel": 8030261248,
+      "tensor_nodes": 8667,
+      "unknown_parameter_nodes": 0
+    },
+    "strategies": {
+      "max_strategies_per_node": 81,
+      "nodes": 8668,
+      "option_tuples": 8657526,
+      "strategy_options": 220687
+    },
+    "timings": {
+      "compute_cost_estimation_s": 1.9735342266503721,
+      "constraint_construction_s": 3.2506618059705943,
+      "cost_estimation_s": 4.9254587206523865,
+      "decision_var_build_s": 15.363263476872817,
+      "decision_var_overhead_s": 6.9146421970799565,
+      "edge_cost_estimation_s": 2.9519244940020144,
+      "ilp_construction_s": 13.688466562191024,
+      "init_total_s": 38.43089710199274,
+      "pulp_var_creation_s": 3.5231625591404736,
+      "strategy_enumeration_s": 10.847158421995118,
+      "validation_s": 0.060926787089556456
+    }
+  },
+  "param_strategy_groups": {
+    "layers.*.attention.wk.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.attention.wo.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.attention.wq.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.attention.wv.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.attention_norm.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.feed_forward.w1.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.feed_forward.w2.weight": {
+      "S(0)S(1)": 32
+    },
+    "layers.*.feed_forward.w3.weight": {
+      "S(0)S(0)": 32
+    },
+    "layers.*.ffn_norm.weight": {
+      "S(0)S(0)": 32
+    },
+    "norm.weight": {
+      "S(0)S(0)": 1
+    },
+    "output.weight": {
+      "S(0)S(0)": 1
+    },
+    "tok_embeddings.weight": {
+      "S(1)S(1)": 1
+    }
+  },
+  "phase_placement_counts": {
+    "backward": [
+      [
+        "S(0)S(2)",
+        1634
+      ],
+      [
+        "S(0)S(1)",
+        1423
+      ],
+      [
+        "P(sum)S(0)",
+        354
+      ],
+      [
+        "P(sum)P(sum)",
+        291
+      ],
+      [
+        "S(0)S(0)",
+        258
+      ],
+      [
+        "RR",
+        257
+      ],
+      [
+        "P(sum)S(1)",
+        225
+      ],
+      [
+        "RS(0)",
+        129
+      ],
+      [
+        "S(0)P(sum)",
+        97
+      ],
+      [
+        "S(0)R",
+        32
+      ],
+      [
+        "RS(1)",
+        32
+      ],
+      [
+        "(S(0)S(1), S(0)S(1), S(0)S(1))",
+        32
+      ],
+      [
+        "S(1)S(1)",
+        3
+      ]
+    ],
+    "forward": [
+      [
+        "S(0)S(2)",
+        1378
+      ],
+      [
+        "S(0)S(1)",
+        1227
+      ],
+      [
+        "S(0)S(0)",
+        516
+      ],
+      [
+        "RR",
+        324
+      ],
+      [
+        "RS(1)",
+        258
+      ],
+      [
+        "S(0)R",
+        66
+      ],
+      [
+        "RS(0)",
+        64
+      ],
+      [
+        "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+        32
+      ],
+      [
+        "S(0)P(sum)",
+        32
+      ],
+      [
+        "S(1)S(1)",
+        2
+      ],
+      [
+        "S(2)S(2)",
+        1
+      ]
+    ]
+  },
+  "placement_counts": [
+    [
+      "S(0)S(2)",
+      3012
+    ],
+    [
+      "S(0)S(1)",
+      2650
+    ],
+    [
+      "S(0)S(0)",
+      774
+    ],
+    [
+      "RR",
+      581
+    ],
+    [
+      "P(sum)S(0)",
+      354
+    ],
+    [
+      "P(sum)P(sum)",
+      291
+    ],
+    [
+      "RS(1)",
+      290
+    ],
+    [
+      "P(sum)S(1)",
+      225
+    ],
+    [
+      "RS(0)",
+      193
+    ],
+    [
+      "S(0)P(sum)",
+      129
+    ],
+    [
+      "S(0)R",
+      98
+    ],
+    [
+      "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
+      32
+    ],
+    [
+      "(S(0)S(1), S(0)S(1), S(0)S(1))",
+      32
+    ],
+    [
+      "S(1)S(1)",
+      5
+    ],
+    [
+      "S(2)S(2)",
+      1
+    ]
+  ],
+  "sample_forward_interesting_nodes": [
+    {
+      "inputs": [],
+      "module_path": "layers.0.attention.wq.weight",
+      "name": "primals_2",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.0.attention.wk.weight",
+      "name": "primals_3",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.0.attention.wv.weight",
+      "name": "primals_4",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.0.attention.wo.weight",
+      "name": "primals_5",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.0.feed_forward.w1.weight",
+      "name": "primals_6",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.0.feed_forward.w2.weight",
+      "name": "primals_7",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.0.feed_forward.w3.weight",
+      "name": "primals_8",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.attention.wq.weight",
+      "name": "primals_11",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.attention.wk.weight",
+      "name": "primals_12",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.attention.wv.weight",
+      "name": "primals_13",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.attention.wo.weight",
+      "name": "primals_14",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.feed_forward.w1.weight",
+      "name": "primals_15",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.feed_forward.w2.weight",
+      "name": "primals_16",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.1.feed_forward.w3.weight",
+      "name": "primals_17",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.attention.wq.weight",
+      "name": "primals_20",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.attention.wk.weight",
+      "name": "primals_21",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.attention.wv.weight",
+      "name": "primals_22",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.attention.wo.weight",
+      "name": "primals_23",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.feed_forward.w1.weight",
+      "name": "primals_24",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.feed_forward.w2.weight",
+      "name": "primals_25",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.2.feed_forward.w3.weight",
+      "name": "primals_26",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.attention.wq.weight",
+      "name": "primals_29",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.attention.wk.weight",
+      "name": "primals_30",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.attention.wv.weight",
+      "name": "primals_31",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.attention.wo.weight",
+      "name": "primals_32",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.feed_forward.w1.weight",
+      "name": "primals_33",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.feed_forward.w2.weight",
+      "name": "primals_34",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.3.feed_forward.w3.weight",
+      "name": "primals_35",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.attention.wq.weight",
+      "name": "primals_38",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.attention.wk.weight",
+      "name": "primals_39",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.attention.wv.weight",
+      "name": "primals_40",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.attention.wo.weight",
+      "name": "primals_41",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.feed_forward.w1.weight",
+      "name": "primals_42",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.feed_forward.w2.weight",
+      "name": "primals_43",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.4.feed_forward.w3.weight",
+      "name": "primals_44",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.attention.wq.weight",
+      "name": "primals_47",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.attention.wk.weight",
+      "name": "primals_48",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.attention.wv.weight",
+      "name": "primals_49",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.attention.wo.weight",
+      "name": "primals_50",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.feed_forward.w1.weight",
+      "name": "primals_51",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.feed_forward.w2.weight",
+      "name": "primals_52",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.5.feed_forward.w3.weight",
+      "name": "primals_53",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.attention.wq.weight",
+      "name": "primals_56",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.attention.wk.weight",
+      "name": "primals_57",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.attention.wv.weight",
+      "name": "primals_58",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.attention.wo.weight",
+      "name": "primals_59",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.feed_forward.w1.weight",
+      "name": "primals_60",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.feed_forward.w2.weight",
+      "name": "primals_61",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.6.feed_forward.w3.weight",
+      "name": "primals_62",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.attention.wq.weight",
+      "name": "primals_65",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.attention.wk.weight",
+      "name": "primals_66",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.attention.wv.weight",
+      "name": "primals_67",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.attention.wo.weight",
+      "name": "primals_68",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.feed_forward.w1.weight",
+      "name": "primals_69",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.feed_forward.w2.weight",
+      "name": "primals_70",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.7.feed_forward.w3.weight",
+      "name": "primals_71",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.attention.wq.weight",
+      "name": "primals_74",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.attention.wk.weight",
+      "name": "primals_75",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.attention.wv.weight",
+      "name": "primals_76",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.attention.wo.weight",
+      "name": "primals_77",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.feed_forward.w1.weight",
+      "name": "primals_78",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.feed_forward.w2.weight",
+      "name": "primals_79",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.8.feed_forward.w3.weight",
+      "name": "primals_80",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.attention.wq.weight",
+      "name": "primals_83",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.attention.wk.weight",
+      "name": "primals_84",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.attention.wv.weight",
+      "name": "primals_85",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.attention.wo.weight",
+      "name": "primals_86",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.feed_forward.w1.weight",
+      "name": "primals_87",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.feed_forward.w2.weight",
+      "name": "primals_88",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.9.feed_forward.w3.weight",
+      "name": "primals_89",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.attention.wq.weight",
+      "name": "primals_92",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.attention.wk.weight",
+      "name": "primals_93",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.attention.wv.weight",
+      "name": "primals_94",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.attention.wo.weight",
+      "name": "primals_95",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.feed_forward.w1.weight",
+      "name": "primals_96",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.feed_forward.w2.weight",
+      "name": "primals_97",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.10.feed_forward.w3.weight",
+      "name": "primals_98",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.attention.wq.weight",
+      "name": "primals_101",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.attention.wk.weight",
+      "name": "primals_102",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.attention.wv.weight",
+      "name": "primals_103",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.attention.wo.weight",
+      "name": "primals_104",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.feed_forward.w1.weight",
+      "name": "primals_105",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.feed_forward.w2.weight",
+      "name": "primals_106",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.11.feed_forward.w3.weight",
+      "name": "primals_107",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.attention.wq.weight",
+      "name": "primals_110",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.attention.wk.weight",
+      "name": "primals_111",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.attention.wv.weight",
+      "name": "primals_112",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.attention.wo.weight",
+      "name": "primals_113",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.feed_forward.w1.weight",
+      "name": "primals_114",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.feed_forward.w2.weight",
+      "name": "primals_115",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.12.feed_forward.w3.weight",
+      "name": "primals_116",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.attention.wq.weight",
+      "name": "primals_119",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.attention.wk.weight",
+      "name": "primals_120",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.attention.wv.weight",
+      "name": "primals_121",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.attention.wo.weight",
+      "name": "primals_122",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.feed_forward.w1.weight",
+      "name": "primals_123",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.feed_forward.w2.weight",
+      "name": "primals_124",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.13.feed_forward.w3.weight",
+      "name": "primals_125",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.attention.wq.weight",
+      "name": "primals_128",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.attention.wk.weight",
+      "name": "primals_129",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.attention.wv.weight",
+      "name": "primals_130",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.attention.wo.weight",
+      "name": "primals_131",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.feed_forward.w1.weight",
+      "name": "primals_132",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.feed_forward.w2.weight",
+      "name": "primals_133",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.14.feed_forward.w3.weight",
+      "name": "primals_134",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.attention.wq.weight",
+      "name": "primals_137",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.attention.wk.weight",
+      "name": "primals_138",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.attention.wv.weight",
+      "name": "primals_139",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.attention.wo.weight",
+      "name": "primals_140",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.feed_forward.w1.weight",
+      "name": "primals_141",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.feed_forward.w2.weight",
+      "name": "primals_142",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.15.feed_forward.w3.weight",
+      "name": "primals_143",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.attention.wq.weight",
+      "name": "primals_146",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.attention.wk.weight",
+      "name": "primals_147",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.attention.wv.weight",
+      "name": "primals_148",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.attention.wo.weight",
+      "name": "primals_149",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.feed_forward.w1.weight",
+      "name": "primals_150",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.feed_forward.w2.weight",
+      "name": "primals_151",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.16.feed_forward.w3.weight",
+      "name": "primals_152",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.attention.wq.weight",
+      "name": "primals_155",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.attention.wk.weight",
+      "name": "primals_156",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.attention.wv.weight",
+      "name": "primals_157",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.attention.wo.weight",
+      "name": "primals_158",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.feed_forward.w1.weight",
+      "name": "primals_159",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.feed_forward.w2.weight",
+      "name": "primals_160",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.17.feed_forward.w3.weight",
+      "name": "primals_161",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.attention.wq.weight",
+      "name": "primals_164",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.attention.wk.weight",
+      "name": "primals_165",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.attention.wv.weight",
+      "name": "primals_166",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.attention.wo.weight",
+      "name": "primals_167",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.feed_forward.w1.weight",
+      "name": "primals_168",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.feed_forward.w2.weight",
+      "name": "primals_169",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.18.feed_forward.w3.weight",
+      "name": "primals_170",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.attention.wq.weight",
+      "name": "primals_173",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.attention.wk.weight",
+      "name": "primals_174",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.attention.wv.weight",
+      "name": "primals_175",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.attention.wo.weight",
+      "name": "primals_176",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.feed_forward.w1.weight",
+      "name": "primals_177",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.feed_forward.w2.weight",
+      "name": "primals_178",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.19.feed_forward.w3.weight",
+      "name": "primals_179",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.attention.wq.weight",
+      "name": "primals_182",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.attention.wk.weight",
+      "name": "primals_183",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.attention.wv.weight",
+      "name": "primals_184",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.attention.wo.weight",
+      "name": "primals_185",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.feed_forward.w1.weight",
+      "name": "primals_186",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.feed_forward.w2.weight",
+      "name": "primals_187",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.20.feed_forward.w3.weight",
+      "name": "primals_188",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.attention.wq.weight",
+      "name": "primals_191",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.attention.wk.weight",
+      "name": "primals_192",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.attention.wv.weight",
+      "name": "primals_193",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.attention.wo.weight",
+      "name": "primals_194",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.feed_forward.w1.weight",
+      "name": "primals_195",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.feed_forward.w2.weight",
+      "name": "primals_196",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.21.feed_forward.w3.weight",
+      "name": "primals_197",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.22.attention.wq.weight",
+      "name": "primals_200",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.22.attention.wk.weight",
+      "name": "primals_201",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.22.attention.wv.weight",
+      "name": "primals_202",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        1024,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.22.attention.wo.weight",
+      "name": "primals_203",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        4096,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.22.feed_forward.w1.weight",
+      "name": "primals_204",
+      "op": "placeholder",
+      "placement": "S(0)S(0)",
+      "shape": [
+        14336,
+        4096
+      ]
+    },
+    {
+      "inputs": [],
+      "module_path": "layers.22.feed_forward.w2.weight",
+      "name": "primals_205",
+      "op": "placeholder",
+      "placement": "S(0)S(1)",
+      "shape": [
+        4096,
+        14336
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/profile_results/real_llama3_3b_dag_node_stats.csv b/profile_results/real_llama3_3b_dag_node_stats.csv
new file mode 100644
index 00000000..5f813f1b
--- /dev/null
+++ b/profile_results/real_llama3_3b_dag_node_stats.csv
@@ -0,0 +1,7200 @@
+idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count
+0,primals_1,placeholder,primals_1,unknown,,0,0,1,0,5816,3
+1,primals_2,placeholder,primals_2,unknown,,0,0,1,0,5777,3
+2,primals_3,placeholder,primals_3,unknown,,0,0,1,0,5777,3
+3,primals_4,placeholder,primals_4,unknown,,0,0,1,0,5770,3
+4,primals_5,placeholder,primals_5,unknown,,0,0,1,0,5757,3
+5,primals_6,placeholder,primals_6,unknown,,0,0,1,0,5737,3
+6,primals_7,placeholder,primals_7,unknown,,0,0,1,0,5714,3
+7,primals_8,placeholder,primals_8,unknown,,0,0,1,0,5718,3
+8,primals_9,placeholder,primals_9,unknown,,0,0,1,0,5794,2
+9,primals_10,placeholder,primals_10,unknown,,0,0,1,0,5741,2
+10,primals_11,placeholder,primals_11,unknown,,0,0,1,0,5681,3
+11,primals_12,placeholder,primals_12,unknown,,0,0,1,0,5681,3
+12,primals_13,placeholder,primals_13,unknown,,0,0,1,0,5674,3
+13,primals_14,placeholder,primals_14,unknown,,0,0,1,0,5661,3
+14,primals_15,placeholder,primals_15,unknown,,0,0,1,0,5641,3
+15,primals_16,placeholder,primals_16,unknown,,0,0,1,0,5618,3
+16,primals_17,placeholder,primals_17,unknown,,0,0,1,0,5622,3
+17,primals_18,placeholder,primals_18,unknown,,0,0,1,0,5698,2
+18,primals_19,placeholder,primals_19,unknown,,0,0,1,0,5645,2
+19,primals_20,placeholder,primals_20,unknown,,0,0,1,0,5585,3
+20,primals_21,placeholder,primals_21,unknown,,0,0,1,0,5585,3
+21,primals_22,placeholder,primals_22,unknown,,0,0,1,0,5578,3
+22,primals_23,placeholder,primals_23,unknown,,0,0,1,0,5565,3
+23,primals_24,placeholder,primals_24,unknown,,0,0,1,0,5545,3
+24,primals_25,placeholder,primals_25,unknown,,0,0,1,0,5522,3
+25,primals_26,placeholder,primals_26,unknown,,0,0,1,0,5526,3
+26,primals_27,placeholder,primals_27,unknown,,0,0,1,0,5602,2
+27,primals_28,placeholder,primals_28,unknown,,0,0,1,0,5549,2
+28,primals_29,placeholder,primals_29,unknown,,0,0,1,0,5489,3
+29,primals_30,placeholder,primals_30,unknown,,0,0,1,0,5489,3
+30,primals_31,placeholder,primals_31,unknown,,0,0,1,0,5482,3
+31,primals_32,placeholder,primals_32,unknown,,0,0,1,0,5469,3
+32,primals_33,placeholder,primals_33,unknown,,0,0,1,0,5449,3
+33,primals_34,placeholder,primals_34,unknown,,0,0,1,0,5426,3
+34,primals_35,placeholder,primals_35,unknown,,0,0,1,0,5430,3
+35,primals_36,placeholder,primals_36,unknown,,0,0,1,0,5506,2
+36,primals_37,placeholder,primals_37,unknown,,0,0,1,0,5453,2
+37,primals_38,placeholder,primals_38,unknown,,0,0,1,0,5393,3
+38,primals_39,placeholder,primals_39,unknown,,0,0,1,0,5393,3
+39,primals_40,placeholder,primals_40,unknown,,0,0,1,0,5386,3
+40,primals_41,placeholder,primals_41,unknown,,0,0,1,0,5373,3
+41,primals_42,placeholder,primals_42,unknown,,0,0,1,0,5353,3
+42,primals_43,placeholder,primals_43,unknown,,0,0,1,0,5330,3
+43,primals_44,placeholder,primals_44,unknown,,0,0,1,0,5334,3
+44,primals_45,placeholder,primals_45,unknown,,0,0,1,0,5410,2
+45,primals_46,placeholder,primals_46,unknown,,0,0,1,0,5357,2
+46,primals_47,placeholder,primals_47,unknown,,0,0,1,0,5297,3
+47,primals_48,placeholder,primals_48,unknown,,0,0,1,0,5297,3
+48,primals_49,placeholder,primals_49,unknown,,0,0,1,0,5290,3
+49,primals_50,placeholder,primals_50,unknown,,0,0,1,0,5277,3
+50,primals_51,placeholder,primals_51,unknown,,0,0,1,0,5257,3
+51,primals_52,placeholder,primals_52,unknown,,0,0,1,0,5234,3
+52,primals_53,placeholder,primals_53,unknown,,0,0,1,0,5238,3
+53,primals_54,placeholder,primals_54,unknown,,0,0,1,0,5314,2
+54,primals_55,placeholder,primals_55,unknown,,0,0,1,0,5261,2
+55,primals_56,placeholder,primals_56,unknown,,0,0,1,0,5201,3
+56,primals_57,placeholder,primals_57,unknown,,0,0,1,0,5201,3
+57,primals_58,placeholder,primals_58,unknown,,0,0,1,0,5194,3
+58,primals_59,placeholder,primals_59,unknown,,0,0,1,0,5181,3
+59,primals_60,placeholder,primals_60,unknown,,0,0,1,0,5161,3
+60,primals_61,placeholder,primals_61,unknown,,0,0,1,0,5138,3
+61,primals_62,placeholder,primals_62,unknown,,0,0,1,0,5142,3
+62,primals_63,placeholder,primals_63,unknown,,0,0,1,0,5218,2
+63,primals_64,placeholder,primals_64,unknown,,0,0,1,0,5165,2
+64,primals_65,placeholder,primals_65,unknown,,0,0,1,0,5105,3
+65,primals_66,placeholder,primals_66,unknown,,0,0,1,0,5105,3
+66,primals_67,placeholder,primals_67,unknown,,0,0,1,0,5098,3
+67,primals_68,placeholder,primals_68,unknown,,0,0,1,0,5085,3
+68,primals_69,placeholder,primals_69,unknown,,0,0,1,0,5065,3
+69,primals_70,placeholder,primals_70,unknown,,0,0,1,0,5042,3
+70,primals_71,placeholder,primals_71,unknown,,0,0,1,0,5046,3
+71,primals_72,placeholder,primals_72,unknown,,0,0,1,0,5122,2
+72,primals_73,placeholder,primals_73,unknown,,0,0,1,0,5069,2
+73,primals_74,placeholder,primals_74,unknown,,0,0,1,0,5009,3
+74,primals_75,placeholder,primals_75,unknown,,0,0,1,0,5009,3
+75,primals_76,placeholder,primals_76,unknown,,0,0,1,0,5002,3
+76,primals_77,placeholder,primals_77,unknown,,0,0,1,0,4989,3
+77,primals_78,placeholder,primals_78,unknown,,0,0,1,0,4969,3
+78,primals_79,placeholder,primals_79,unknown,,0,0,1,0,4946,3
+79,primals_80,placeholder,primals_80,unknown,,0,0,1,0,4950,3
+80,primals_81,placeholder,primals_81,unknown,,0,0,1,0,5026,2
+81,primals_82,placeholder,primals_82,unknown,,0,0,1,0,4973,2
+82,primals_83,placeholder,primals_83,unknown,,0,0,1,0,4913,3
+83,primals_84,placeholder,primals_84,unknown,,0,0,1,0,4913,3
+84,primals_85,placeholder,primals_85,unknown,,0,0,1,0,4906,3
+85,primals_86,placeholder,primals_86,unknown,,0,0,1,0,4893,3
+86,primals_87,placeholder,primals_87,unknown,,0,0,1,0,4873,3
+87,primals_88,placeholder,primals_88,unknown,,0,0,1,0,4850,3
+88,primals_89,placeholder,primals_89,unknown,,0,0,1,0,4854,3
+89,primals_90,placeholder,primals_90,unknown,,0,0,1,0,4930,2
+90,primals_91,placeholder,primals_91,unknown,,0,0,1,0,4877,2
+91,primals_92,placeholder,primals_92,unknown,,0,0,1,0,4817,3
+92,primals_93,placeholder,primals_93,unknown,,0,0,1,0,4817,3
+93,primals_94,placeholder,primals_94,unknown,,0,0,1,0,4810,3
+94,primals_95,placeholder,primals_95,unknown,,0,0,1,0,4797,3
+95,primals_96,placeholder,primals_96,unknown,,0,0,1,0,4777,3
+96,primals_97,placeholder,primals_97,unknown,,0,0,1,0,4754,3
+97,primals_98,placeholder,primals_98,unknown,,0,0,1,0,4758,3
+98,primals_99,placeholder,primals_99,unknown,,0,0,1,0,4834,2
+99,primals_100,placeholder,primals_100,unknown,,0,0,1,0,4781,2
+100,primals_101,placeholder,primals_101,unknown,,0,0,1,0,4721,3
+101,primals_102,placeholder,primals_102,unknown,,0,0,1,0,4721,3
+102,primals_103,placeholder,primals_103,unknown,,0,0,1,0,4714,3
+103,primals_104,placeholder,primals_104,unknown,,0,0,1,0,4701,3
+104,primals_105,placeholder,primals_105,unknown,,0,0,1,0,4681,3
+105,primals_106,placeholder,primals_106,unknown,,0,0,1,0,4658,3
+106,primals_107,placeholder,primals_107,unknown,,0,0,1,0,4662,3
+107,primals_108,placeholder,primals_108,unknown,,0,0,1,0,4738,2
+108,primals_109,placeholder,primals_109,unknown,,0,0,1,0,4685,2
+109,primals_110,placeholder,primals_110,unknown,,0,0,1,0,4625,3
+110,primals_111,placeholder,primals_111,unknown,,0,0,1,0,4625,3
+111,primals_112,placeholder,primals_112,unknown,,0,0,1,0,4618,3
+112,primals_113,placeholder,primals_113,unknown,,0,0,1,0,4605,3
+113,primals_114,placeholder,primals_114,unknown,,0,0,1,0,4585,3
+114,primals_115,placeholder,primals_115,unknown,,0,0,1,0,4562,3
+115,primals_116,placeholder,primals_116,unknown,,0,0,1,0,4566,3
+116,primals_117,placeholder,primals_117,unknown,,0,0,1,0,4642,2
+117,primals_118,placeholder,primals_118,unknown,,0,0,1,0,4589,2
+118,primals_119,placeholder,primals_119,unknown,,0,0,1,0,4529,3
+119,primals_120,placeholder,primals_120,unknown,,0,0,1,0,4529,3
+120,primals_121,placeholder,primals_121,unknown,,0,0,1,0,4522,3
+121,primals_122,placeholder,primals_122,unknown,,0,0,1,0,4509,3
+122,primals_123,placeholder,primals_123,unknown,,0,0,1,0,4489,3
+123,primals_124,placeholder,primals_124,unknown,,0,0,1,0,4466,3
+124,primals_125,placeholder,primals_125,unknown,,0,0,1,0,4470,3
+125,primals_126,placeholder,primals_126,unknown,,0,0,1,0,4546,2
+126,primals_127,placeholder,primals_127,unknown,,0,0,1,0,4493,2
+127,primals_128,placeholder,primals_128,unknown,,0,0,1,0,4433,3
+128,primals_129,placeholder,primals_129,unknown,,0,0,1,0,4433,3
+129,primals_130,placeholder,primals_130,unknown,,0,0,1,0,4426,3
+130,primals_131,placeholder,primals_131,unknown,,0,0,1,0,4413,3
+131,primals_132,placeholder,primals_132,unknown,,0,0,1,0,4393,3
+132,primals_133,placeholder,primals_133,unknown,,0,0,1,0,4370,3
+133,primals_134,placeholder,primals_134,unknown,,0,0,1,0,4374,3
+134,primals_135,placeholder,primals_135,unknown,,0,0,1,0,4450,2
+135,primals_136,placeholder,primals_136,unknown,,0,0,1,0,4397,2
+136,primals_137,placeholder,primals_137,unknown,,0,0,1,0,4337,3
+137,primals_138,placeholder,primals_138,unknown,,0,0,1,0,4337,3
+138,primals_139,placeholder,primals_139,unknown,,0,0,1,0,4330,3
+139,primals_140,placeholder,primals_140,unknown,,0,0,1,0,4317,3
+140,primals_141,placeholder,primals_141,unknown,,0,0,1,0,4297,3
+141,primals_142,placeholder,primals_142,unknown,,0,0,1,0,4274,3
+142,primals_143,placeholder,primals_143,unknown,,0,0,1,0,4278,3
+143,primals_144,placeholder,primals_144,unknown,,0,0,1,0,4354,2
+144,primals_145,placeholder,primals_145,unknown,,0,0,1,0,4301,2
+145,primals_146,placeholder,primals_146,unknown,,0,0,1,0,4241,3
+146,primals_147,placeholder,primals_147,unknown,,0,0,1,0,4241,3
+147,primals_148,placeholder,primals_148,unknown,,0,0,1,0,4234,3
+148,primals_149,placeholder,primals_149,unknown,,0,0,1,0,4221,3
+149,primals_150,placeholder,primals_150,unknown,,0,0,1,0,4201,3
+150,primals_151,placeholder,primals_151,unknown,,0,0,1,0,4178,3
+151,primals_152,placeholder,primals_152,unknown,,0,0,1,0,4182,3
+152,primals_153,placeholder,primals_153,unknown,,0,0,1,0,4258,2
+153,primals_154,placeholder,primals_154,unknown,,0,0,1,0,4205,2
+154,primals_155,placeholder,primals_155,unknown,,0,0,1,0,4145,3
+155,primals_156,placeholder,primals_156,unknown,,0,0,1,0,4145,3
+156,primals_157,placeholder,primals_157,unknown,,0,0,1,0,4138,3
+157,primals_158,placeholder,primals_158,unknown,,0,0,1,0,4125,3
+158,primals_159,placeholder,primals_159,unknown,,0,0,1,0,4105,3
+159,primals_160,placeholder,primals_160,unknown,,0,0,1,0,4082,3
+160,primals_161,placeholder,primals_161,unknown,,0,0,1,0,4086,3
+161,primals_162,placeholder,primals_162,unknown,,0,0,1,0,4162,2
+162,primals_163,placeholder,primals_163,unknown,,0,0,1,0,4109,2
+163,primals_164,placeholder,primals_164,unknown,,0,0,1,0,4049,3
+164,primals_165,placeholder,primals_165,unknown,,0,0,1,0,4049,3
+165,primals_166,placeholder,primals_166,unknown,,0,0,1,0,4042,3
+166,primals_167,placeholder,primals_167,unknown,,0,0,1,0,4029,3
+167,primals_168,placeholder,primals_168,unknown,,0,0,1,0,4009,3
+168,primals_169,placeholder,primals_169,unknown,,0,0,1,0,3986,3
+169,primals_170,placeholder,primals_170,unknown,,0,0,1,0,3990,3
+170,primals_171,placeholder,primals_171,unknown,,0,0,1,0,4066,2
+171,primals_172,placeholder,primals_172,unknown,,0,0,1,0,4013,2
+172,primals_173,placeholder,primals_173,unknown,,0,0,1,0,3953,3
+173,primals_174,placeholder,primals_174,unknown,,0,0,1,0,3953,3
+174,primals_175,placeholder,primals_175,unknown,,0,0,1,0,3946,3
+175,primals_176,placeholder,primals_176,unknown,,0,0,1,0,3933,3
+176,primals_177,placeholder,primals_177,unknown,,0,0,1,0,3913,3
+177,primals_178,placeholder,primals_178,unknown,,0,0,1,0,3890,3
+178,primals_179,placeholder,primals_179,unknown,,0,0,1,0,3894,3
+179,primals_180,placeholder,primals_180,unknown,,0,0,1,0,3970,2
+180,primals_181,placeholder,primals_181,unknown,,0,0,1,0,3917,2
+181,primals_182,placeholder,primals_182,unknown,,0,0,1,0,3857,3
+182,primals_183,placeholder,primals_183,unknown,,0,0,1,0,3857,3
+183,primals_184,placeholder,primals_184,unknown,,0,0,1,0,3850,3
+184,primals_185,placeholder,primals_185,unknown,,0,0,1,0,3837,3
+185,primals_186,placeholder,primals_186,unknown,,0,0,1,0,3817,3
+186,primals_187,placeholder,primals_187,unknown,,0,0,1,0,3794,3
+187,primals_188,placeholder,primals_188,unknown,,0,0,1,0,3798,3
+188,primals_189,placeholder,primals_189,unknown,,0,0,1,0,3874,2
+189,primals_190,placeholder,primals_190,unknown,,0,0,1,0,3821,2
+190,primals_191,placeholder,primals_191,unknown,,0,0,1,0,3761,3
+191,primals_192,placeholder,primals_192,unknown,,0,0,1,0,3761,3
+192,primals_193,placeholder,primals_193,unknown,,0,0,1,0,3754,3
+193,primals_194,placeholder,primals_194,unknown,,0,0,1,0,3741,3
+194,primals_195,placeholder,primals_195,unknown,,0,0,1,0,3721,3
+195,primals_196,placeholder,primals_196,unknown,,0,0,1,0,3698,3
+196,primals_197,placeholder,primals_197,unknown,,0,0,1,0,3702,3
+197,primals_198,placeholder,primals_198,unknown,,0,0,1,0,3778,2
+198,primals_199,placeholder,primals_199,unknown,,0,0,1,0,3725,2
+199,primals_200,placeholder,primals_200,unknown,,0,0,1,0,3665,3
+200,primals_201,placeholder,primals_201,unknown,,0,0,1,0,3665,3
+201,primals_202,placeholder,primals_202,unknown,,0,0,1,0,3658,3
+202,primals_203,placeholder,primals_203,unknown,,0,0,1,0,3645,3
+203,primals_204,placeholder,primals_204,unknown,,0,0,1,0,3625,3
+204,primals_205,placeholder,primals_205,unknown,,0,0,1,0,3602,3
+205,primals_206,placeholder,primals_206,unknown,,0,0,1,0,3606,3
+206,primals_207,placeholder,primals_207,unknown,,0,0,1,0,3682,2
+207,primals_208,placeholder,primals_208,unknown,,0,0,1,0,3629,2
+208,primals_209,placeholder,primals_209,unknown,,0,0,1,0,3569,3
+209,primals_210,placeholder,primals_210,unknown,,0,0,1,0,3569,3
+210,primals_211,placeholder,primals_211,unknown,,0,0,1,0,3562,3
+211,primals_212,placeholder,primals_212,unknown,,0,0,1,0,3549,3
+212,primals_213,placeholder,primals_213,unknown,,0,0,1,0,3529,3
+213,primals_214,placeholder,primals_214,unknown,,0,0,1,0,3506,3
+214,primals_215,placeholder,primals_215,unknown,,0,0,1,0,3510,3
+215,primals_216,placeholder,primals_216,unknown,,0,0,1,0,3586,2
+216,primals_217,placeholder,primals_217,unknown,,0,0,1,0,3533,2
+217,primals_218,placeholder,primals_218,unknown,,0,0,1,0,3473,3
+218,primals_219,placeholder,primals_219,unknown,,0,0,1,0,3473,3
+219,primals_220,placeholder,primals_220,unknown,,0,0,1,0,3466,3
+220,primals_221,placeholder,primals_221,unknown,,0,0,1,0,3453,3
+221,primals_222,placeholder,primals_222,unknown,,0,0,1,0,3433,3
+222,primals_223,placeholder,primals_223,unknown,,0,0,1,0,3410,3
+223,primals_224,placeholder,primals_224,unknown,,0,0,1,0,3414,3
+224,primals_225,placeholder,primals_225,unknown,,0,0,1,0,3490,2
+225,primals_226,placeholder,primals_226,unknown,,0,0,1,0,3437,2
+226,primals_227,placeholder,primals_227,unknown,,0,0,1,0,3377,3
+227,primals_228,placeholder,primals_228,unknown,,0,0,1,0,3377,3
+228,primals_229,placeholder,primals_229,unknown,,0,0,1,0,3370,3
+229,primals_230,placeholder,primals_230,unknown,,0,0,1,0,3357,3
+230,primals_231,placeholder,primals_231,unknown,,0,0,1,0,3337,3
+231,primals_232,placeholder,primals_232,unknown,,0,0,1,0,3314,3
+232,primals_233,placeholder,primals_233,unknown,,0,0,1,0,3318,3
+233,primals_234,placeholder,primals_234,unknown,,0,0,1,0,3394,2
+234,primals_235,placeholder,primals_235,unknown,,0,0,1,0,3341,2
+235,primals_236,placeholder,primals_236,unknown,,0,0,1,0,3281,3
+236,primals_237,placeholder,primals_237,unknown,,0,0,1,0,3281,3
+237,primals_238,placeholder,primals_238,unknown,,0,0,1,0,3274,3
+238,primals_239,placeholder,primals_239,unknown,,0,0,1,0,3261,3
+239,primals_240,placeholder,primals_240,unknown,,0,0,1,0,3241,3
+240,primals_241,placeholder,primals_241,unknown,,0,0,1,0,3218,3
+241,primals_242,placeholder,primals_242,unknown,,0,0,1,0,3222,3
+242,primals_243,placeholder,primals_243,unknown,,0,0,1,0,3298,2
+243,primals_244,placeholder,primals_244,unknown,,0,0,1,0,3245,2
+244,primals_245,placeholder,primals_245,unknown,,0,0,1,0,3185,3
+245,primals_246,placeholder,primals_246,unknown,,0,0,1,0,3185,3
+246,primals_247,placeholder,primals_247,unknown,,0,0,1,0,3178,3
+247,primals_248,placeholder,primals_248,unknown,,0,0,1,0,3165,3
+248,primals_249,placeholder,primals_249,unknown,,0,0,1,0,3145,3
+249,primals_250,placeholder,primals_250,unknown,,0,0,1,0,3122,3
+250,primals_251,placeholder,primals_251,unknown,,0,0,1,0,3126,3
+251,primals_252,placeholder,primals_252,unknown,,0,0,1,0,3202,2
+252,primals_253,placeholder,primals_253,unknown,,0,0,1,0,3149,2
+253,primals_254,placeholder,primals_254,unknown,,0,0,1,0,3103,2
+254,primals_255,placeholder,primals_255,unknown,,0,0,1,0,5943,3
+255,primals_256,placeholder,primals_256,unknown,,0,0,1,0,5806,3
+256,tangents_1,placeholder,tangents_1,backward,,0,0,1,0,3104,4
+257,alias_default,call_function,alias.default,unknown,,1,1,2,1,5815,3
+258,dtype_cast,call_function,dtype_cast.default,forward,,1,1,1,2,5805,3
+259,alias_default_2,call_function,alias.default,unknown,,1,1,2,1,5805,3
+260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5
+261,dtype_cast_1,call_function,dtype_cast.default,forward,0,1,1,1,1,5793,2
+262,alias_default_4,call_function,alias.default,forward,,1,1,3,6,5803,4
+263,convert_element_type,call_function,convert_element_type.default,forward,0,1,1,1,7,5801,4
+264,alias_default_6,call_function,alias.default,forward,0,1,1,2,8,5800,4
+265,pow_1,call_function,pow.Tensor_Scalar,forward,0,1,1,1,9,5799,4
+266,mean,call_function,mean.dim,forward,0,1,1,1,10,5798,4
+267,add,call_function,add.Scalar,forward,0,1,1,1,11,5797,3
+268,rsqrt,call_function,rsqrt.default,forward,0,1,1,1,12,5796,3
+269,alias_default_7,call_function,alias.default,forward,0,1,1,3,13,5795,3
+270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8
+271,alias_default_5,call_function,alias.default,forward,0,1,1,2,2,5792,2
+272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8
+273,convert_element_type_1,call_function,convert_element_type.default,forward,0,1,1,1,19,5789,6
+274,dtype_cast_2,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3
+275,permute,call_function,permute.default,forward,0,1,1,1,2,5775,3
+276,alias_default_8,call_function,alias.default,forward,0,1,1,6,20,5788,4
+277,alias_default_9,call_function,alias.default,forward,0,1,1,2,3,5774,3
+278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5
+279,dtype_cast_3,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3
+280,permute_1,call_function,permute.default,forward,0,1,1,1,2,5775,3
+281,alias_default_10,call_function,alias.default,forward,0,1,1,2,3,5774,3
+282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5
+283,dtype_cast_4,call_function,dtype_cast.default,forward,0,1,1,1,1,5769,3
+284,permute_2,call_function,permute.default,forward,0,1,1,1,2,5768,3
+285,alias_default_11,call_function,alias.default,forward,0,1,1,2,3,5767,3
+286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5
+287,view_6,call_function,view.default,forward,0,1,1,1,26,5771,4
+288,view_7,call_function,view.default,forward,0,1,1,1,26,5771,4
+289,view_8,call_function,view.default,forward,0,1,1,1,26,5764,4
+290,convert_element_type_8,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4
+291,view_9,call_function,view.default,forward,0,1,1,1,28,5769,4
+292,view_as_complex,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6
+293,convert_element_type_9,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4
+294,view_10,call_function,view.default,forward,0,1,1,1,28,5769,4
+295,view_as_complex_1,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6
+296,alias_default_1,call_function,alias.default,unknown,,1,1,28,1,5942,3
+297,view_11,call_function,view.default,forward,0,1,1,1,2,5779,3
+298,alias_default_12,call_function,alias.default,forward,0,1,1,4,3,5778,3
+299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
+300,view_as_real,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6
+301,view_12,call_function,view.default,forward,0,1,1,1,36,5765,6
+302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
+303,view_as_real_1,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6
+304,view_13,call_function,view.default,forward,0,1,1,1,36,5765,6
+305,convert_element_type_10,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6
+306,convert_element_type_11,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6
+307,permute_3,call_function,permute.default,forward,0,1,1,1,38,5763,6
+308,permute_4,call_function,permute.default,forward,0,1,1,1,38,5763,6
+309,permute_5,call_function,permute.default,forward,0,1,1,1,27,5763,4
+310,alias_default_13,call_function,alias.default,forward,0,1,1,2,39,5762,4
+311,alias_default_14,call_function,alias.default,forward,0,1,1,2,39,5762,4
+312,alias_default_15,call_function,alias.default,forward,0,1,1,2,28,5762,4
+313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2
+314,getitem,call_function,getitem,forward,0,1,1,1,64,5757,2
+315,getitem_1,call_function,getitem,forward,0,1,1,1,64,64,2
+316,getitem_6,call_function,getitem,forward,0,1,1,1,64,64,1
+317,getitem_7,call_function,getitem,forward,0,1,1,1,64,64,1
+318,alias_default_16,call_function,alias.default,forward,0,1,1,2,65,5756,4
+319,permute_6,call_function,permute.default,forward,0,1,1,1,66,5755,4
+320,view_14,call_function,view.default,forward,0,1,1,1,67,5754,3
+321,dtype_cast_5,call_function,dtype_cast.default,forward,0,1,1,1,1,5756,3
+322,permute_7,call_function,permute.default,forward,0,1,1,1,2,5755,3
+323,alias_default_17,call_function,alias.default,forward,0,1,1,2,68,5753,4
+324,alias_default_18,call_function,alias.default,forward,0,1,1,2,3,5754,3
+325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5
+326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10
+327,dtype_cast_6,call_function,dtype_cast.default,forward,0,1,1,1,1,5740,2
+328,alias_default_19,call_function,alias.default,forward,0,1,1,3,75,5750,4
+329,convert_element_type_14,call_function,convert_element_type.default,forward,0,1,1,1,76,5748,4
+330,alias_default_21,call_function,alias.default,forward,0,1,1,2,77,5747,4
+331,pow_2,call_function,pow.Tensor_Scalar,forward,0,1,1,1,78,5746,4
+332,mean_1,call_function,mean.dim,forward,0,1,1,1,79,5745,4
+333,add_2,call_function,add.Scalar,forward,0,1,1,1,80,5744,3
+334,rsqrt_1,call_function,rsqrt.default,forward,0,1,1,1,81,5743,3
+335,alias_default_22,call_function,alias.default,forward,0,1,1,3,82,5742,3
+336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8
+337,alias_default_20,call_function,alias.default,forward,0,1,1,2,2,5739,2
+338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8
+339,convert_element_type_15,call_function,convert_element_type.default,forward,0,1,1,1,88,5736,6
+340,dtype_cast_7,call_function,dtype_cast.default,forward,0,1,1,1,1,5736,3
+341,permute_8,call_function,permute.default,forward,0,1,1,1,2,5735,3
+342,alias_default_23,call_function,alias.default,forward,0,1,1,4,89,5735,4
+343,alias_default_24,call_function,alias.default,forward,0,1,1,2,3,5734,3
+344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5
+345,alias_default_25,call_function,alias.default,forward,0,1,1,2,95,5731,4
+346,convert_element_type_18,call_function,convert_element_type.default,forward,0,1,1,1,96,5719,4
+347,alias_default_26,call_function,alias.default,forward,0,1,1,2,97,5718,4
+348,neg,call_function,neg.default,forward,0,1,1,1,98,5717,8
+349,exp,call_function,exp.default,forward,0,1,1,1,99,5716,6
+350,add_3,call_function,add.Tensor,forward,0,1,1,1,100,5715,4
+351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6
+352,convert_element_type_19,call_function,convert_element_type.default,forward,0,1,1,1,102,5713,6
+353,dtype_cast_8,call_function,dtype_cast.default,forward,0,1,1,1,1,5717,3
+354,permute_9,call_function,permute.default,forward,0,1,1,1,2,5716,3
+355,alias_default_28,call_function,alias.default,forward,0,1,1,2,3,5715,3
+356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5
+357,alias_default_27,call_function,alias.default,forward,0,1,1,2,103,5712,4
+358,alias_default_29,call_function,alias.default,forward,0,1,1,2,95,5712,4
+359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8
+360,dtype_cast_9,call_function,dtype_cast.default,forward,0,1,1,1,1,5713,3
+361,permute_10,call_function,permute.default,forward,0,1,1,1,2,5712,3
+362,alias_default_30,call_function,alias.default,forward,0,1,1,2,111,5710,4
+363,alias_default_31,call_function,alias.default,forward,0,1,1,2,3,5711,3
+364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5
+365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10
+366,dtype_cast_10,call_function,dtype_cast.default,forward,1,1,1,1,1,5697,2
+367,alias_default_32,call_function,alias.default,forward,0,1,1,3,118,5707,4
+368,convert_element_type_24,call_function,convert_element_type.default,forward,1,1,1,1,119,5705,4
+369,alias_default_34,call_function,alias.default,forward,1,1,1,2,120,5704,4
+370,pow_3,call_function,pow.Tensor_Scalar,forward,1,1,1,1,121,5703,4
+371,mean_2,call_function,mean.dim,forward,1,1,1,1,122,5702,4
+372,add_5,call_function,add.Scalar,forward,1,1,1,1,123,5701,3
+373,rsqrt_2,call_function,rsqrt.default,forward,1,1,1,1,124,5700,3
+374,alias_default_35,call_function,alias.default,forward,1,1,1,3,125,5699,3
+375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8
+376,alias_default_33,call_function,alias.default,forward,1,1,1,2,2,5696,2
+377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8
+378,convert_element_type_25,call_function,convert_element_type.default,forward,1,1,1,1,131,5693,6
+379,dtype_cast_11,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3
+380,permute_11,call_function,permute.default,forward,1,1,1,1,2,5679,3
+381,alias_default_36,call_function,alias.default,forward,1,1,1,6,132,5692,4
+382,alias_default_37,call_function,alias.default,forward,1,1,1,2,3,5678,3
+383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5
+384,dtype_cast_12,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3
+385,permute_12,call_function,permute.default,forward,1,1,1,1,2,5679,3
+386,alias_default_38,call_function,alias.default,forward,1,1,1,2,3,5678,3
+387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5
+388,dtype_cast_13,call_function,dtype_cast.default,forward,1,1,1,1,1,5673,3
+389,permute_13,call_function,permute.default,forward,1,1,1,1,2,5672,3
+390,alias_default_39,call_function,alias.default,forward,1,1,1,2,3,5671,3
+391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5
+392,view_29,call_function,view.default,forward,1,1,1,1,138,5675,4
+393,view_30,call_function,view.default,forward,1,1,1,1,138,5675,4
+394,view_31,call_function,view.default,forward,1,1,1,1,138,5668,4
+395,convert_element_type_32,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4
+396,view_32,call_function,view.default,forward,1,1,1,1,140,5673,4
+397,view_as_complex_2,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6
+398,convert_element_type_33,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4
+399,view_33,call_function,view.default,forward,1,1,1,1,140,5673,4
+400,view_as_complex_3,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6
+401,view_34,call_function,view.default,forward,1,1,1,1,2,5683,3
+402,alias_default_40,call_function,alias.default,forward,1,1,1,4,3,5682,3
+403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
+404,view_as_real_2,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6
+405,view_35,call_function,view.default,forward,1,1,1,1,146,5669,6
+406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
+407,view_as_real_3,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6
+408,view_36,call_function,view.default,forward,1,1,1,1,146,5669,6
+409,convert_element_type_34,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6
+410,convert_element_type_35,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6
+411,permute_14,call_function,permute.default,forward,1,1,1,1,148,5667,6
+412,permute_15,call_function,permute.default,forward,1,1,1,1,148,5667,6
+413,permute_16,call_function,permute.default,forward,1,1,1,1,139,5667,4
+414,alias_default_41,call_function,alias.default,forward,1,1,1,2,149,5666,4
+415,alias_default_42,call_function,alias.default,forward,1,1,1,2,149,5666,4
+416,alias_default_43,call_function,alias.default,forward,1,1,1,2,140,5666,4
+417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2
+418,getitem_9,call_function,getitem,forward,1,1,1,1,174,5661,2
+419,getitem_10,call_function,getitem,forward,1,1,1,1,174,174,2
+420,getitem_15,call_function,getitem,forward,1,1,1,1,174,174,1
+421,getitem_16,call_function,getitem,forward,1,1,1,1,174,174,1
+422,alias_default_44,call_function,alias.default,forward,1,1,1,2,175,5660,4
+423,permute_17,call_function,permute.default,forward,1,1,1,1,176,5659,4
+424,view_37,call_function,view.default,forward,1,1,1,1,177,5658,3
+425,dtype_cast_14,call_function,dtype_cast.default,forward,1,1,1,1,1,5660,3
+426,permute_18,call_function,permute.default,forward,1,1,1,1,2,5659,3
+427,alias_default_45,call_function,alias.default,forward,1,1,1,2,178,5657,4
+428,alias_default_46,call_function,alias.default,forward,1,1,1,2,3,5658,3
+429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5
+430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10
+431,dtype_cast_15,call_function,dtype_cast.default,forward,1,1,1,1,1,5644,2
+432,alias_default_47,call_function,alias.default,forward,1,1,1,3,185,5654,4
+433,convert_element_type_38,call_function,convert_element_type.default,forward,1,1,1,1,186,5652,4
+434,alias_default_49,call_function,alias.default,forward,1,1,1,2,187,5651,4
+435,pow_4,call_function,pow.Tensor_Scalar,forward,1,1,1,1,188,5650,4
+436,mean_3,call_function,mean.dim,forward,1,1,1,1,189,5649,4
+437,add_7,call_function,add.Scalar,forward,1,1,1,1,190,5648,3
+438,rsqrt_3,call_function,rsqrt.default,forward,1,1,1,1,191,5647,3
+439,alias_default_50,call_function,alias.default,forward,1,1,1,3,192,5646,3
+440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8
+441,alias_default_48,call_function,alias.default,forward,1,1,1,2,2,5643,2
+442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8
+443,convert_element_type_39,call_function,convert_element_type.default,forward,1,1,1,1,198,5640,6
+444,dtype_cast_16,call_function,dtype_cast.default,forward,1,1,1,1,1,5640,3
+445,permute_19,call_function,permute.default,forward,1,1,1,1,2,5639,3
+446,alias_default_51,call_function,alias.default,forward,1,1,1,4,199,5639,4
+447,alias_default_52,call_function,alias.default,forward,1,1,1,2,3,5638,3
+448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5
+449,alias_default_53,call_function,alias.default,forward,1,1,1,2,205,5635,4
+450,convert_element_type_42,call_function,convert_element_type.default,forward,1,1,1,1,206,5623,4
+451,alias_default_54,call_function,alias.default,forward,1,1,1,2,207,5622,4
+452,neg_1,call_function,neg.default,forward,1,1,1,1,208,5621,8
+453,exp_1,call_function,exp.default,forward,1,1,1,1,209,5620,6
+454,add_8,call_function,add.Tensor,forward,1,1,1,1,210,5619,4
+455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6
+456,convert_element_type_43,call_function,convert_element_type.default,forward,1,1,1,1,212,5617,6
+457,dtype_cast_17,call_function,dtype_cast.default,forward,1,1,1,1,1,5621,3
+458,permute_20,call_function,permute.default,forward,1,1,1,1,2,5620,3
+459,alias_default_56,call_function,alias.default,forward,1,1,1,2,3,5619,3
+460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5
+461,alias_default_55,call_function,alias.default,forward,1,1,1,2,213,5616,4
+462,alias_default_57,call_function,alias.default,forward,1,1,1,2,205,5616,4
+463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8
+464,dtype_cast_18,call_function,dtype_cast.default,forward,1,1,1,1,1,5617,3
+465,permute_21,call_function,permute.default,forward,1,1,1,1,2,5616,3
+466,alias_default_58,call_function,alias.default,forward,1,1,1,2,221,5614,4
+467,alias_default_59,call_function,alias.default,forward,1,1,1,2,3,5615,3
+468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5
+469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10
+470,dtype_cast_19,call_function,dtype_cast.default,forward,2,1,1,1,1,5601,2
+471,alias_default_60,call_function,alias.default,forward,1,1,1,3,228,5611,4
+472,convert_element_type_48,call_function,convert_element_type.default,forward,2,1,1,1,229,5609,4
+473,alias_default_62,call_function,alias.default,forward,2,1,1,2,230,5608,4
+474,pow_5,call_function,pow.Tensor_Scalar,forward,2,1,1,1,231,5607,4
+475,mean_4,call_function,mean.dim,forward,2,1,1,1,232,5606,4
+476,add_10,call_function,add.Scalar,forward,2,1,1,1,233,5605,3
+477,rsqrt_4,call_function,rsqrt.default,forward,2,1,1,1,234,5604,3
+478,alias_default_63,call_function,alias.default,forward,2,1,1,3,235,5603,3
+479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8
+480,alias_default_61,call_function,alias.default,forward,2,1,1,2,2,5600,2
+481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8
+482,convert_element_type_49,call_function,convert_element_type.default,forward,2,1,1,1,241,5597,6
+483,dtype_cast_20,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3
+484,permute_22,call_function,permute.default,forward,2,1,1,1,2,5583,3
+485,alias_default_64,call_function,alias.default,forward,2,1,1,6,242,5596,4
+486,alias_default_65,call_function,alias.default,forward,2,1,1,2,3,5582,3
+487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5
+488,dtype_cast_21,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3
+489,permute_23,call_function,permute.default,forward,2,1,1,1,2,5583,3
+490,alias_default_66,call_function,alias.default,forward,2,1,1,2,3,5582,3
+491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5
+492,dtype_cast_22,call_function,dtype_cast.default,forward,2,1,1,1,1,5577,3
+493,permute_24,call_function,permute.default,forward,2,1,1,1,2,5576,3
+494,alias_default_67,call_function,alias.default,forward,2,1,1,2,3,5575,3
+495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5
+496,view_52,call_function,view.default,forward,2,1,1,1,248,5579,4
+497,view_53,call_function,view.default,forward,2,1,1,1,248,5579,4
+498,view_54,call_function,view.default,forward,2,1,1,1,248,5572,4
+499,convert_element_type_56,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4
+500,view_55,call_function,view.default,forward,2,1,1,1,250,5577,4
+501,view_as_complex_4,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6
+502,convert_element_type_57,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4
+503,view_56,call_function,view.default,forward,2,1,1,1,250,5577,4
+504,view_as_complex_5,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6
+505,view_57,call_function,view.default,forward,2,1,1,1,2,5587,3
+506,alias_default_68,call_function,alias.default,forward,2,1,1,4,3,5586,3
+507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
+508,view_as_real_4,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6
+509,view_58,call_function,view.default,forward,2,1,1,1,256,5573,6
+510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
+511,view_as_real_5,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6
+512,view_59,call_function,view.default,forward,2,1,1,1,256,5573,6
+513,convert_element_type_58,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6
+514,convert_element_type_59,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6
+515,permute_25,call_function,permute.default,forward,2,1,1,1,258,5571,6
+516,permute_26,call_function,permute.default,forward,2,1,1,1,258,5571,6
+517,permute_27,call_function,permute.default,forward,2,1,1,1,249,5571,4
+518,alias_default_69,call_function,alias.default,forward,2,1,1,2,259,5570,4
+519,alias_default_70,call_function,alias.default,forward,2,1,1,2,259,5570,4
+520,alias_default_71,call_function,alias.default,forward,2,1,1,2,250,5570,4
+521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2
+522,getitem_18,call_function,getitem,forward,2,1,1,1,284,5565,2
+523,getitem_19,call_function,getitem,forward,2,1,1,1,284,284,2
+524,getitem_24,call_function,getitem,forward,2,1,1,1,284,284,1
+525,getitem_25,call_function,getitem,forward,2,1,1,1,284,284,1
+526,alias_default_72,call_function,alias.default,forward,2,1,1,2,285,5564,4
+527,permute_28,call_function,permute.default,forward,2,1,1,1,286,5563,4
+528,view_60,call_function,view.default,forward,2,1,1,1,287,5562,3
+529,dtype_cast_23,call_function,dtype_cast.default,forward,2,1,1,1,1,5564,3
+530,permute_29,call_function,permute.default,forward,2,1,1,1,2,5563,3
+531,alias_default_73,call_function,alias.default,forward,2,1,1,2,288,5561,4
+532,alias_default_74,call_function,alias.default,forward,2,1,1,2,3,5562,3
+533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5
+534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10
+535,dtype_cast_24,call_function,dtype_cast.default,forward,2,1,1,1,1,5548,2
+536,alias_default_75,call_function,alias.default,forward,2,1,1,3,295,5558,4
+537,convert_element_type_62,call_function,convert_element_type.default,forward,2,1,1,1,296,5556,4
+538,alias_default_77,call_function,alias.default,forward,2,1,1,2,297,5555,4
+539,pow_6,call_function,pow.Tensor_Scalar,forward,2,1,1,1,298,5554,4
+540,mean_5,call_function,mean.dim,forward,2,1,1,1,299,5553,4
+541,add_12,call_function,add.Scalar,forward,2,1,1,1,300,5552,3
+542,rsqrt_5,call_function,rsqrt.default,forward,2,1,1,1,301,5551,3
+543,alias_default_78,call_function,alias.default,forward,2,1,1,3,302,5550,3
+544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8
+545,alias_default_76,call_function,alias.default,forward,2,1,1,2,2,5547,2
+546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8
+547,convert_element_type_63,call_function,convert_element_type.default,forward,2,1,1,1,308,5544,6
+548,dtype_cast_25,call_function,dtype_cast.default,forward,2,1,1,1,1,5544,3
+549,permute_30,call_function,permute.default,forward,2,1,1,1,2,5543,3
+550,alias_default_79,call_function,alias.default,forward,2,1,1,4,309,5543,4
+551,alias_default_80,call_function,alias.default,forward,2,1,1,2,3,5542,3
+552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5
+553,alias_default_81,call_function,alias.default,forward,2,1,1,2,315,5539,4
+554,convert_element_type_66,call_function,convert_element_type.default,forward,2,1,1,1,316,5527,4
+555,alias_default_82,call_function,alias.default,forward,2,1,1,2,317,5526,4
+556,neg_2,call_function,neg.default,forward,2,1,1,1,318,5525,8
+557,exp_2,call_function,exp.default,forward,2,1,1,1,319,5524,6
+558,add_13,call_function,add.Tensor,forward,2,1,1,1,320,5523,4
+559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6
+560,convert_element_type_67,call_function,convert_element_type.default,forward,2,1,1,1,322,5521,6
+561,dtype_cast_26,call_function,dtype_cast.default,forward,2,1,1,1,1,5525,3
+562,permute_31,call_function,permute.default,forward,2,1,1,1,2,5524,3
+563,alias_default_84,call_function,alias.default,forward,2,1,1,2,3,5523,3
+564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5
+565,alias_default_83,call_function,alias.default,forward,2,1,1,2,323,5520,4
+566,alias_default_85,call_function,alias.default,forward,2,1,1,2,315,5520,4
+567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8
+568,dtype_cast_27,call_function,dtype_cast.default,forward,2,1,1,1,1,5521,3
+569,permute_32,call_function,permute.default,forward,2,1,1,1,2,5520,3
+570,alias_default_86,call_function,alias.default,forward,2,1,1,2,331,5518,4
+571,alias_default_87,call_function,alias.default,forward,2,1,1,2,3,5519,3
+572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5
+573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10
+574,dtype_cast_28,call_function,dtype_cast.default,forward,3,1,1,1,1,5505,2
+575,alias_default_88,call_function,alias.default,forward,2,1,1,3,338,5515,4
+576,convert_element_type_72,call_function,convert_element_type.default,forward,3,1,1,1,339,5513,4
+577,alias_default_90,call_function,alias.default,forward,3,1,1,2,340,5512,4
+578,pow_7,call_function,pow.Tensor_Scalar,forward,3,1,1,1,341,5511,4
+579,mean_6,call_function,mean.dim,forward,3,1,1,1,342,5510,4
+580,add_15,call_function,add.Scalar,forward,3,1,1,1,343,5509,3
+581,rsqrt_6,call_function,rsqrt.default,forward,3,1,1,1,344,5508,3
+582,alias_default_91,call_function,alias.default,forward,3,1,1,3,345,5507,3
+583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8
+584,alias_default_89,call_function,alias.default,forward,3,1,1,2,2,5504,2
+585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8
+586,convert_element_type_73,call_function,convert_element_type.default,forward,3,1,1,1,351,5501,6
+587,dtype_cast_29,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3
+588,permute_33,call_function,permute.default,forward,3,1,1,1,2,5487,3
+589,alias_default_92,call_function,alias.default,forward,3,1,1,6,352,5500,4
+590,alias_default_93,call_function,alias.default,forward,3,1,1,2,3,5486,3
+591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5
+592,dtype_cast_30,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3
+593,permute_34,call_function,permute.default,forward,3,1,1,1,2,5487,3
+594,alias_default_94,call_function,alias.default,forward,3,1,1,2,3,5486,3
+595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5
+596,dtype_cast_31,call_function,dtype_cast.default,forward,3,1,1,1,1,5481,3
+597,permute_35,call_function,permute.default,forward,3,1,1,1,2,5480,3
+598,alias_default_95,call_function,alias.default,forward,3,1,1,2,3,5479,3
+599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5
+600,view_75,call_function,view.default,forward,3,1,1,1,358,5483,4
+601,view_76,call_function,view.default,forward,3,1,1,1,358,5483,4
+602,view_77,call_function,view.default,forward,3,1,1,1,358,5476,4
+603,convert_element_type_80,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4
+604,view_78,call_function,view.default,forward,3,1,1,1,360,5481,4
+605,view_as_complex_6,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6
+606,convert_element_type_81,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4
+607,view_79,call_function,view.default,forward,3,1,1,1,360,5481,4
+608,view_as_complex_7,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6
+609,view_80,call_function,view.default,forward,3,1,1,1,2,5491,3
+610,alias_default_96,call_function,alias.default,forward,3,1,1,4,3,5490,3
+611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
+612,view_as_real_6,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6
+613,view_81,call_function,view.default,forward,3,1,1,1,366,5477,6
+614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
+615,view_as_real_7,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6
+616,view_82,call_function,view.default,forward,3,1,1,1,366,5477,6
+617,convert_element_type_82,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6
+618,convert_element_type_83,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6
+619,permute_36,call_function,permute.default,forward,3,1,1,1,368,5475,6
+620,permute_37,call_function,permute.default,forward,3,1,1,1,368,5475,6
+621,permute_38,call_function,permute.default,forward,3,1,1,1,359,5475,4
+622,alias_default_97,call_function,alias.default,forward,3,1,1,2,369,5474,4
+623,alias_default_98,call_function,alias.default,forward,3,1,1,2,369,5474,4
+624,alias_default_99,call_function,alias.default,forward,3,1,1,2,360,5474,4
+625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2
+626,getitem_27,call_function,getitem,forward,3,1,1,1,394,5469,2
+627,getitem_28,call_function,getitem,forward,3,1,1,1,394,394,2
+628,getitem_33,call_function,getitem,forward,3,1,1,1,394,394,1
+629,getitem_34,call_function,getitem,forward,3,1,1,1,394,394,1
+630,alias_default_100,call_function,alias.default,forward,3,1,1,2,395,5468,4
+631,permute_39,call_function,permute.default,forward,3,1,1,1,396,5467,4
+632,view_83,call_function,view.default,forward,3,1,1,1,397,5466,3
+633,dtype_cast_32,call_function,dtype_cast.default,forward,3,1,1,1,1,5468,3
+634,permute_40,call_function,permute.default,forward,3,1,1,1,2,5467,3
+635,alias_default_101,call_function,alias.default,forward,3,1,1,2,398,5465,4
+636,alias_default_102,call_function,alias.default,forward,3,1,1,2,3,5466,3
+637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5
+638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10
+639,dtype_cast_33,call_function,dtype_cast.default,forward,3,1,1,1,1,5452,2
+640,alias_default_103,call_function,alias.default,forward,3,1,1,3,405,5462,4
+641,convert_element_type_86,call_function,convert_element_type.default,forward,3,1,1,1,406,5460,4
+642,alias_default_105,call_function,alias.default,forward,3,1,1,2,407,5459,4
+643,pow_8,call_function,pow.Tensor_Scalar,forward,3,1,1,1,408,5458,4
+644,mean_7,call_function,mean.dim,forward,3,1,1,1,409,5457,4
+645,add_17,call_function,add.Scalar,forward,3,1,1,1,410,5456,3
+646,rsqrt_7,call_function,rsqrt.default,forward,3,1,1,1,411,5455,3
+647,alias_default_106,call_function,alias.default,forward,3,1,1,3,412,5454,3
+648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8
+649,alias_default_104,call_function,alias.default,forward,3,1,1,2,2,5451,2
+650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8
+651,convert_element_type_87,call_function,convert_element_type.default,forward,3,1,1,1,418,5448,6
+652,dtype_cast_34,call_function,dtype_cast.default,forward,3,1,1,1,1,5448,3
+653,permute_41,call_function,permute.default,forward,3,1,1,1,2,5447,3
+654,alias_default_107,call_function,alias.default,forward,3,1,1,4,419,5447,4
+655,alias_default_108,call_function,alias.default,forward,3,1,1,2,3,5446,3
+656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5
+657,alias_default_109,call_function,alias.default,forward,3,1,1,2,425,5443,4
+658,convert_element_type_90,call_function,convert_element_type.default,forward,3,1,1,1,426,5431,4
+659,alias_default_110,call_function,alias.default,forward,3,1,1,2,427,5430,4
+660,neg_3,call_function,neg.default,forward,3,1,1,1,428,5429,8
+661,exp_3,call_function,exp.default,forward,3,1,1,1,429,5428,6
+662,add_18,call_function,add.Tensor,forward,3,1,1,1,430,5427,4
+663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6
+664,convert_element_type_91,call_function,convert_element_type.default,forward,3,1,1,1,432,5425,6
+665,dtype_cast_35,call_function,dtype_cast.default,forward,3,1,1,1,1,5429,3
+666,permute_42,call_function,permute.default,forward,3,1,1,1,2,5428,3
+667,alias_default_112,call_function,alias.default,forward,3,1,1,2,3,5427,3
+668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5
+669,alias_default_111,call_function,alias.default,forward,3,1,1,2,433,5424,4
+670,alias_default_113,call_function,alias.default,forward,3,1,1,2,425,5424,4
+671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8
+672,dtype_cast_36,call_function,dtype_cast.default,forward,3,1,1,1,1,5425,3
+673,permute_43,call_function,permute.default,forward,3,1,1,1,2,5424,3
+674,alias_default_114,call_function,alias.default,forward,3,1,1,2,441,5422,4
+675,alias_default_115,call_function,alias.default,forward,3,1,1,2,3,5423,3
+676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5
+677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10
+678,dtype_cast_37,call_function,dtype_cast.default,forward,4,1,1,1,1,5409,2
+679,alias_default_116,call_function,alias.default,forward,3,1,1,3,448,5419,4
+680,convert_element_type_96,call_function,convert_element_type.default,forward,4,1,1,1,449,5417,4
+681,alias_default_118,call_function,alias.default,forward,4,1,1,2,450,5416,4
+682,pow_9,call_function,pow.Tensor_Scalar,forward,4,1,1,1,451,5415,4
+683,mean_8,call_function,mean.dim,forward,4,1,1,1,452,5414,4
+684,add_20,call_function,add.Scalar,forward,4,1,1,1,453,5413,3
+685,rsqrt_8,call_function,rsqrt.default,forward,4,1,1,1,454,5412,3
+686,alias_default_119,call_function,alias.default,forward,4,1,1,3,455,5411,3
+687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8
+688,alias_default_117,call_function,alias.default,forward,4,1,1,2,2,5408,2
+689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8
+690,convert_element_type_97,call_function,convert_element_type.default,forward,4,1,1,1,461,5405,6
+691,dtype_cast_38,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3
+692,permute_44,call_function,permute.default,forward,4,1,1,1,2,5391,3
+693,alias_default_120,call_function,alias.default,forward,4,1,1,6,462,5404,4
+694,alias_default_121,call_function,alias.default,forward,4,1,1,2,3,5390,3
+695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5
+696,dtype_cast_39,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3
+697,permute_45,call_function,permute.default,forward,4,1,1,1,2,5391,3
+698,alias_default_122,call_function,alias.default,forward,4,1,1,2,3,5390,3
+699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5
+700,dtype_cast_40,call_function,dtype_cast.default,forward,4,1,1,1,1,5385,3
+701,permute_46,call_function,permute.default,forward,4,1,1,1,2,5384,3
+702,alias_default_123,call_function,alias.default,forward,4,1,1,2,3,5383,3
+703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5
+704,view_98,call_function,view.default,forward,4,1,1,1,468,5387,4
+705,view_99,call_function,view.default,forward,4,1,1,1,468,5387,4
+706,view_100,call_function,view.default,forward,4,1,1,1,468,5380,4
+707,convert_element_type_104,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4
+708,view_101,call_function,view.default,forward,4,1,1,1,470,5385,4
+709,view_as_complex_8,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6
+710,convert_element_type_105,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4
+711,view_102,call_function,view.default,forward,4,1,1,1,470,5385,4
+712,view_as_complex_9,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6
+713,view_103,call_function,view.default,forward,4,1,1,1,2,5395,3
+714,alias_default_124,call_function,alias.default,forward,4,1,1,4,3,5394,3
+715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
+716,view_as_real_8,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6
+717,view_104,call_function,view.default,forward,4,1,1,1,476,5381,6
+718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
+719,view_as_real_9,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6
+720,view_105,call_function,view.default,forward,4,1,1,1,476,5381,6
+721,convert_element_type_106,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6
+722,convert_element_type_107,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6
+723,permute_47,call_function,permute.default,forward,4,1,1,1,478,5379,6
+724,permute_48,call_function,permute.default,forward,4,1,1,1,478,5379,6
+725,permute_49,call_function,permute.default,forward,4,1,1,1,469,5379,4
+726,alias_default_125,call_function,alias.default,forward,4,1,1,2,479,5378,4
+727,alias_default_126,call_function,alias.default,forward,4,1,1,2,479,5378,4
+728,alias_default_127,call_function,alias.default,forward,4,1,1,2,470,5378,4
+729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2
+730,getitem_36,call_function,getitem,forward,4,1,1,1,504,5373,2
+731,getitem_37,call_function,getitem,forward,4,1,1,1,504,504,2
+732,getitem_42,call_function,getitem,forward,4,1,1,1,504,504,1
+733,getitem_43,call_function,getitem,forward,4,1,1,1,504,504,1
+734,alias_default_128,call_function,alias.default,forward,4,1,1,2,505,5372,4
+735,permute_50,call_function,permute.default,forward,4,1,1,1,506,5371,4
+736,view_106,call_function,view.default,forward,4,1,1,1,507,5370,3
+737,dtype_cast_41,call_function,dtype_cast.default,forward,4,1,1,1,1,5372,3
+738,permute_51,call_function,permute.default,forward,4,1,1,1,2,5371,3
+739,alias_default_129,call_function,alias.default,forward,4,1,1,2,508,5369,4
+740,alias_default_130,call_function,alias.default,forward,4,1,1,2,3,5370,3
+741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5
+742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10
+743,dtype_cast_42,call_function,dtype_cast.default,forward,4,1,1,1,1,5356,2
+744,alias_default_131,call_function,alias.default,forward,4,1,1,3,515,5366,4
+745,convert_element_type_110,call_function,convert_element_type.default,forward,4,1,1,1,516,5364,4
+746,alias_default_133,call_function,alias.default,forward,4,1,1,2,517,5363,4
+747,pow_10,call_function,pow.Tensor_Scalar,forward,4,1,1,1,518,5362,4
+748,mean_9,call_function,mean.dim,forward,4,1,1,1,519,5361,4
+749,add_22,call_function,add.Scalar,forward,4,1,1,1,520,5360,3
+750,rsqrt_9,call_function,rsqrt.default,forward,4,1,1,1,521,5359,3
+751,alias_default_134,call_function,alias.default,forward,4,1,1,3,522,5358,3
+752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8
+753,alias_default_132,call_function,alias.default,forward,4,1,1,2,2,5355,2
+754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8
+755,convert_element_type_111,call_function,convert_element_type.default,forward,4,1,1,1,528,5352,6
+756,dtype_cast_43,call_function,dtype_cast.default,forward,4,1,1,1,1,5352,3
+757,permute_52,call_function,permute.default,forward,4,1,1,1,2,5351,3
+758,alias_default_135,call_function,alias.default,forward,4,1,1,4,529,5351,4
+759,alias_default_136,call_function,alias.default,forward,4,1,1,2,3,5350,3
+760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5
+761,alias_default_137,call_function,alias.default,forward,4,1,1,2,535,5347,4
+762,convert_element_type_114,call_function,convert_element_type.default,forward,4,1,1,1,536,5335,4
+763,alias_default_138,call_function,alias.default,forward,4,1,1,2,537,5334,4
+764,neg_4,call_function,neg.default,forward,4,1,1,1,538,5333,8
+765,exp_4,call_function,exp.default,forward,4,1,1,1,539,5332,6
+766,add_23,call_function,add.Tensor,forward,4,1,1,1,540,5331,4
+767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6
+768,convert_element_type_115,call_function,convert_element_type.default,forward,4,1,1,1,542,5329,6
+769,dtype_cast_44,call_function,dtype_cast.default,forward,4,1,1,1,1,5333,3
+770,permute_53,call_function,permute.default,forward,4,1,1,1,2,5332,3
+771,alias_default_140,call_function,alias.default,forward,4,1,1,2,3,5331,3
+772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5
+773,alias_default_139,call_function,alias.default,forward,4,1,1,2,543,5328,4
+774,alias_default_141,call_function,alias.default,forward,4,1,1,2,535,5328,4
+775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8
+776,dtype_cast_45,call_function,dtype_cast.default,forward,4,1,1,1,1,5329,3
+777,permute_54,call_function,permute.default,forward,4,1,1,1,2,5328,3
+778,alias_default_142,call_function,alias.default,forward,4,1,1,2,551,5326,4
+779,alias_default_143,call_function,alias.default,forward,4,1,1,2,3,5327,3
+780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5
+781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10
+782,dtype_cast_46,call_function,dtype_cast.default,forward,5,1,1,1,1,5313,2
+783,alias_default_144,call_function,alias.default,forward,4,1,1,3,558,5323,4
+784,convert_element_type_120,call_function,convert_element_type.default,forward,5,1,1,1,559,5321,4
+785,alias_default_146,call_function,alias.default,forward,5,1,1,2,560,5320,4
+786,pow_11,call_function,pow.Tensor_Scalar,forward,5,1,1,1,561,5319,4
+787,mean_10,call_function,mean.dim,forward,5,1,1,1,562,5318,4
+788,add_25,call_function,add.Scalar,forward,5,1,1,1,563,5317,3
+789,rsqrt_10,call_function,rsqrt.default,forward,5,1,1,1,564,5316,3
+790,alias_default_147,call_function,alias.default,forward,5,1,1,3,565,5315,3
+791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8
+792,alias_default_145,call_function,alias.default,forward,5,1,1,2,2,5312,2
+793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8
+794,convert_element_type_121,call_function,convert_element_type.default,forward,5,1,1,1,571,5309,6
+795,dtype_cast_47,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3
+796,permute_55,call_function,permute.default,forward,5,1,1,1,2,5295,3
+797,alias_default_148,call_function,alias.default,forward,5,1,1,6,572,5308,4
+798,alias_default_149,call_function,alias.default,forward,5,1,1,2,3,5294,3
+799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5
+800,dtype_cast_48,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3
+801,permute_56,call_function,permute.default,forward,5,1,1,1,2,5295,3
+802,alias_default_150,call_function,alias.default,forward,5,1,1,2,3,5294,3
+803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5
+804,dtype_cast_49,call_function,dtype_cast.default,forward,5,1,1,1,1,5289,3
+805,permute_57,call_function,permute.default,forward,5,1,1,1,2,5288,3
+806,alias_default_151,call_function,alias.default,forward,5,1,1,2,3,5287,3
+807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5
+808,view_121,call_function,view.default,forward,5,1,1,1,578,5291,4
+809,view_122,call_function,view.default,forward,5,1,1,1,578,5291,4
+810,view_123,call_function,view.default,forward,5,1,1,1,578,5284,4
+811,convert_element_type_128,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4
+812,view_124,call_function,view.default,forward,5,1,1,1,580,5289,4
+813,view_as_complex_10,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6
+814,convert_element_type_129,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4
+815,view_125,call_function,view.default,forward,5,1,1,1,580,5289,4
+816,view_as_complex_11,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6
+817,view_126,call_function,view.default,forward,5,1,1,1,2,5299,3
+818,alias_default_152,call_function,alias.default,forward,5,1,1,4,3,5298,3
+819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
+820,view_as_real_10,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6
+821,view_127,call_function,view.default,forward,5,1,1,1,586,5285,6
+822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
+823,view_as_real_11,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6
+824,view_128,call_function,view.default,forward,5,1,1,1,586,5285,6
+825,convert_element_type_130,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6
+826,convert_element_type_131,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6
+827,permute_58,call_function,permute.default,forward,5,1,1,1,588,5283,6
+828,permute_59,call_function,permute.default,forward,5,1,1,1,588,5283,6
+829,permute_60,call_function,permute.default,forward,5,1,1,1,579,5283,4
+830,alias_default_153,call_function,alias.default,forward,5,1,1,2,589,5282,4
+831,alias_default_154,call_function,alias.default,forward,5,1,1,2,589,5282,4
+832,alias_default_155,call_function,alias.default,forward,5,1,1,2,580,5282,4
+833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2
+834,getitem_45,call_function,getitem,forward,5,1,1,1,614,5277,2
+835,getitem_46,call_function,getitem,forward,5,1,1,1,614,614,2
+836,getitem_51,call_function,getitem,forward,5,1,1,1,614,614,1
+837,getitem_52,call_function,getitem,forward,5,1,1,1,614,614,1
+838,alias_default_156,call_function,alias.default,forward,5,1,1,2,615,5276,4
+839,permute_61,call_function,permute.default,forward,5,1,1,1,616,5275,4
+840,view_129,call_function,view.default,forward,5,1,1,1,617,5274,3
+841,dtype_cast_50,call_function,dtype_cast.default,forward,5,1,1,1,1,5276,3
+842,permute_62,call_function,permute.default,forward,5,1,1,1,2,5275,3
+843,alias_default_157,call_function,alias.default,forward,5,1,1,2,618,5273,4
+844,alias_default_158,call_function,alias.default,forward,5,1,1,2,3,5274,3
+845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5
+846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10
+847,dtype_cast_51,call_function,dtype_cast.default,forward,5,1,1,1,1,5260,2
+848,alias_default_159,call_function,alias.default,forward,5,1,1,3,625,5270,4
+849,convert_element_type_134,call_function,convert_element_type.default,forward,5,1,1,1,626,5268,4
+850,alias_default_161,call_function,alias.default,forward,5,1,1,2,627,5267,4
+851,pow_12,call_function,pow.Tensor_Scalar,forward,5,1,1,1,628,5266,4
+852,mean_11,call_function,mean.dim,forward,5,1,1,1,629,5265,4
+853,add_27,call_function,add.Scalar,forward,5,1,1,1,630,5264,3
+854,rsqrt_11,call_function,rsqrt.default,forward,5,1,1,1,631,5263,3
+855,alias_default_162,call_function,alias.default,forward,5,1,1,3,632,5262,3
+856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8
+857,alias_default_160,call_function,alias.default,forward,5,1,1,2,2,5259,2
+858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8
+859,convert_element_type_135,call_function,convert_element_type.default,forward,5,1,1,1,638,5256,6
+860,dtype_cast_52,call_function,dtype_cast.default,forward,5,1,1,1,1,5256,3
+861,permute_63,call_function,permute.default,forward,5,1,1,1,2,5255,3
+862,alias_default_163,call_function,alias.default,forward,5,1,1,4,639,5255,4
+863,alias_default_164,call_function,alias.default,forward,5,1,1,2,3,5254,3
+864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5
+865,alias_default_165,call_function,alias.default,forward,5,1,1,2,645,5251,4
+866,convert_element_type_138,call_function,convert_element_type.default,forward,5,1,1,1,646,5239,4
+867,alias_default_166,call_function,alias.default,forward,5,1,1,2,647,5238,4
+868,neg_5,call_function,neg.default,forward,5,1,1,1,648,5237,8
+869,exp_5,call_function,exp.default,forward,5,1,1,1,649,5236,6
+870,add_28,call_function,add.Tensor,forward,5,1,1,1,650,5235,4
+871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6
+872,convert_element_type_139,call_function,convert_element_type.default,forward,5,1,1,1,652,5233,6
+873,dtype_cast_53,call_function,dtype_cast.default,forward,5,1,1,1,1,5237,3
+874,permute_64,call_function,permute.default,forward,5,1,1,1,2,5236,3
+875,alias_default_168,call_function,alias.default,forward,5,1,1,2,3,5235,3
+876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5
+877,alias_default_167,call_function,alias.default,forward,5,1,1,2,653,5232,4
+878,alias_default_169,call_function,alias.default,forward,5,1,1,2,645,5232,4
+879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8
+880,dtype_cast_54,call_function,dtype_cast.default,forward,5,1,1,1,1,5233,3
+881,permute_65,call_function,permute.default,forward,5,1,1,1,2,5232,3
+882,alias_default_170,call_function,alias.default,forward,5,1,1,2,661,5230,4
+883,alias_default_171,call_function,alias.default,forward,5,1,1,2,3,5231,3
+884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5
+885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10
+886,dtype_cast_55,call_function,dtype_cast.default,forward,6,1,1,1,1,5217,2
+887,alias_default_172,call_function,alias.default,forward,5,1,1,3,668,5227,4
+888,convert_element_type_144,call_function,convert_element_type.default,forward,6,1,1,1,669,5225,4
+889,alias_default_174,call_function,alias.default,forward,6,1,1,2,670,5224,4
+890,pow_13,call_function,pow.Tensor_Scalar,forward,6,1,1,1,671,5223,4
+891,mean_12,call_function,mean.dim,forward,6,1,1,1,672,5222,4
+892,add_30,call_function,add.Scalar,forward,6,1,1,1,673,5221,3
+893,rsqrt_12,call_function,rsqrt.default,forward,6,1,1,1,674,5220,3
+894,alias_default_175,call_function,alias.default,forward,6,1,1,3,675,5219,3
+895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8
+896,alias_default_173,call_function,alias.default,forward,6,1,1,2,2,5216,2
+897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8
+898,convert_element_type_145,call_function,convert_element_type.default,forward,6,1,1,1,681,5213,6
+899,dtype_cast_56,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3
+900,permute_66,call_function,permute.default,forward,6,1,1,1,2,5199,3
+901,alias_default_176,call_function,alias.default,forward,6,1,1,6,682,5212,4
+902,alias_default_177,call_function,alias.default,forward,6,1,1,2,3,5198,3
+903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5
+904,dtype_cast_57,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3
+905,permute_67,call_function,permute.default,forward,6,1,1,1,2,5199,3
+906,alias_default_178,call_function,alias.default,forward,6,1,1,2,3,5198,3
+907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5
+908,dtype_cast_58,call_function,dtype_cast.default,forward,6,1,1,1,1,5193,3
+909,permute_68,call_function,permute.default,forward,6,1,1,1,2,5192,3
+910,alias_default_179,call_function,alias.default,forward,6,1,1,2,3,5191,3
+911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5
+912,view_144,call_function,view.default,forward,6,1,1,1,688,5195,4
+913,view_145,call_function,view.default,forward,6,1,1,1,688,5195,4
+914,view_146,call_function,view.default,forward,6,1,1,1,688,5188,4
+915,convert_element_type_152,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4
+916,view_147,call_function,view.default,forward,6,1,1,1,690,5193,4
+917,view_as_complex_12,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6
+918,convert_element_type_153,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4
+919,view_148,call_function,view.default,forward,6,1,1,1,690,5193,4
+920,view_as_complex_13,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6
+921,view_149,call_function,view.default,forward,6,1,1,1,2,5203,3
+922,alias_default_180,call_function,alias.default,forward,6,1,1,4,3,5202,3
+923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
+924,view_as_real_12,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6
+925,view_150,call_function,view.default,forward,6,1,1,1,696,5189,6
+926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
+927,view_as_real_13,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6
+928,view_151,call_function,view.default,forward,6,1,1,1,696,5189,6
+929,convert_element_type_154,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6
+930,convert_element_type_155,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6
+931,permute_69,call_function,permute.default,forward,6,1,1,1,698,5187,6
+932,permute_70,call_function,permute.default,forward,6,1,1,1,698,5187,6
+933,permute_71,call_function,permute.default,forward,6,1,1,1,689,5187,4
+934,alias_default_181,call_function,alias.default,forward,6,1,1,2,699,5186,4
+935,alias_default_182,call_function,alias.default,forward,6,1,1,2,699,5186,4
+936,alias_default_183,call_function,alias.default,forward,6,1,1,2,690,5186,4
+937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2
+938,getitem_54,call_function,getitem,forward,6,1,1,1,724,5181,2
+939,getitem_55,call_function,getitem,forward,6,1,1,1,724,724,2
+940,getitem_60,call_function,getitem,forward,6,1,1,1,724,724,1
+941,getitem_61,call_function,getitem,forward,6,1,1,1,724,724,1
+942,alias_default_184,call_function,alias.default,forward,6,1,1,2,725,5180,4
+943,permute_72,call_function,permute.default,forward,6,1,1,1,726,5179,4
+944,view_152,call_function,view.default,forward,6,1,1,1,727,5178,3
+945,dtype_cast_59,call_function,dtype_cast.default,forward,6,1,1,1,1,5180,3
+946,permute_73,call_function,permute.default,forward,6,1,1,1,2,5179,3
+947,alias_default_185,call_function,alias.default,forward,6,1,1,2,728,5177,4
+948,alias_default_186,call_function,alias.default,forward,6,1,1,2,3,5178,3
+949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5
+950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10
+951,dtype_cast_60,call_function,dtype_cast.default,forward,6,1,1,1,1,5164,2
+952,alias_default_187,call_function,alias.default,forward,6,1,1,3,735,5174,4
+953,convert_element_type_158,call_function,convert_element_type.default,forward,6,1,1,1,736,5172,4
+954,alias_default_189,call_function,alias.default,forward,6,1,1,2,737,5171,4
+955,pow_14,call_function,pow.Tensor_Scalar,forward,6,1,1,1,738,5170,4
+956,mean_13,call_function,mean.dim,forward,6,1,1,1,739,5169,4
+957,add_32,call_function,add.Scalar,forward,6,1,1,1,740,5168,3
+958,rsqrt_13,call_function,rsqrt.default,forward,6,1,1,1,741,5167,3
+959,alias_default_190,call_function,alias.default,forward,6,1,1,3,742,5166,3
+960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8
+961,alias_default_188,call_function,alias.default,forward,6,1,1,2,2,5163,2
+962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8
+963,convert_element_type_159,call_function,convert_element_type.default,forward,6,1,1,1,748,5160,6
+964,dtype_cast_61,call_function,dtype_cast.default,forward,6,1,1,1,1,5160,3
+965,permute_74,call_function,permute.default,forward,6,1,1,1,2,5159,3
+966,alias_default_191,call_function,alias.default,forward,6,1,1,4,749,5159,4
+967,alias_default_192,call_function,alias.default,forward,6,1,1,2,3,5158,3
+968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5
+969,alias_default_193,call_function,alias.default,forward,6,1,1,2,755,5155,4
+970,convert_element_type_162,call_function,convert_element_type.default,forward,6,1,1,1,756,5143,4
+971,alias_default_194,call_function,alias.default,forward,6,1,1,2,757,5142,4
+972,neg_6,call_function,neg.default,forward,6,1,1,1,758,5141,8
+973,exp_6,call_function,exp.default,forward,6,1,1,1,759,5140,6
+974,add_33,call_function,add.Tensor,forward,6,1,1,1,760,5139,4
+975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6
+976,convert_element_type_163,call_function,convert_element_type.default,forward,6,1,1,1,762,5137,6
+977,dtype_cast_62,call_function,dtype_cast.default,forward,6,1,1,1,1,5141,3
+978,permute_75,call_function,permute.default,forward,6,1,1,1,2,5140,3
+979,alias_default_196,call_function,alias.default,forward,6,1,1,2,3,5139,3
+980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5
+981,alias_default_195,call_function,alias.default,forward,6,1,1,2,763,5136,4
+982,alias_default_197,call_function,alias.default,forward,6,1,1,2,755,5136,4
+983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8
+984,dtype_cast_63,call_function,dtype_cast.default,forward,6,1,1,1,1,5137,3
+985,permute_76,call_function,permute.default,forward,6,1,1,1,2,5136,3
+986,alias_default_198,call_function,alias.default,forward,6,1,1,2,771,5134,4
+987,alias_default_199,call_function,alias.default,forward,6,1,1,2,3,5135,3
+988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5
+989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10
+990,dtype_cast_64,call_function,dtype_cast.default,forward,7,1,1,1,1,5121,2
+991,alias_default_200,call_function,alias.default,forward,6,1,1,3,778,5131,4
+992,convert_element_type_168,call_function,convert_element_type.default,forward,7,1,1,1,779,5129,4
+993,alias_default_202,call_function,alias.default,forward,7,1,1,2,780,5128,4
+994,pow_15,call_function,pow.Tensor_Scalar,forward,7,1,1,1,781,5127,4
+995,mean_14,call_function,mean.dim,forward,7,1,1,1,782,5126,4
+996,add_35,call_function,add.Scalar,forward,7,1,1,1,783,5125,3
+997,rsqrt_14,call_function,rsqrt.default,forward,7,1,1,1,784,5124,3
+998,alias_default_203,call_function,alias.default,forward,7,1,1,3,785,5123,3
+999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8
+1000,alias_default_201,call_function,alias.default,forward,7,1,1,2,2,5120,2
+1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8
+1002,convert_element_type_169,call_function,convert_element_type.default,forward,7,1,1,1,791,5117,6
+1003,dtype_cast_65,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3
+1004,permute_77,call_function,permute.default,forward,7,1,1,1,2,5103,3
+1005,alias_default_204,call_function,alias.default,forward,7,1,1,6,792,5116,4
+1006,alias_default_205,call_function,alias.default,forward,7,1,1,2,3,5102,3
+1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5
+1008,dtype_cast_66,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3
+1009,permute_78,call_function,permute.default,forward,7,1,1,1,2,5103,3
+1010,alias_default_206,call_function,alias.default,forward,7,1,1,2,3,5102,3
+1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5
+1012,dtype_cast_67,call_function,dtype_cast.default,forward,7,1,1,1,1,5097,3
+1013,permute_79,call_function,permute.default,forward,7,1,1,1,2,5096,3
+1014,alias_default_207,call_function,alias.default,forward,7,1,1,2,3,5095,3
+1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5
+1016,view_167,call_function,view.default,forward,7,1,1,1,798,5099,4
+1017,view_168,call_function,view.default,forward,7,1,1,1,798,5099,4
+1018,view_169,call_function,view.default,forward,7,1,1,1,798,5092,4
+1019,convert_element_type_176,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4
+1020,view_170,call_function,view.default,forward,7,1,1,1,800,5097,4
+1021,view_as_complex_14,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6
+1022,convert_element_type_177,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4
+1023,view_171,call_function,view.default,forward,7,1,1,1,800,5097,4
+1024,view_as_complex_15,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6
+1025,view_172,call_function,view.default,forward,7,1,1,1,2,5107,3
+1026,alias_default_208,call_function,alias.default,forward,7,1,1,4,3,5106,3
+1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
+1028,view_as_real_14,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6
+1029,view_173,call_function,view.default,forward,7,1,1,1,806,5093,6
+1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
+1031,view_as_real_15,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6
+1032,view_174,call_function,view.default,forward,7,1,1,1,806,5093,6
+1033,convert_element_type_178,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6
+1034,convert_element_type_179,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6
+1035,permute_80,call_function,permute.default,forward,7,1,1,1,808,5091,6
+1036,permute_81,call_function,permute.default,forward,7,1,1,1,808,5091,6
+1037,permute_82,call_function,permute.default,forward,7,1,1,1,799,5091,4
+1038,alias_default_209,call_function,alias.default,forward,7,1,1,2,809,5090,4
+1039,alias_default_210,call_function,alias.default,forward,7,1,1,2,809,5090,4
+1040,alias_default_211,call_function,alias.default,forward,7,1,1,2,800,5090,4
+1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2
+1042,getitem_63,call_function,getitem,forward,7,1,1,1,834,5085,2
+1043,getitem_64,call_function,getitem,forward,7,1,1,1,834,834,2
+1044,getitem_69,call_function,getitem,forward,7,1,1,1,834,834,1
+1045,getitem_70,call_function,getitem,forward,7,1,1,1,834,834,1
+1046,alias_default_212,call_function,alias.default,forward,7,1,1,2,835,5084,4
+1047,permute_83,call_function,permute.default,forward,7,1,1,1,836,5083,4
+1048,view_175,call_function,view.default,forward,7,1,1,1,837,5082,3
+1049,dtype_cast_68,call_function,dtype_cast.default,forward,7,1,1,1,1,5084,3
+1050,permute_84,call_function,permute.default,forward,7,1,1,1,2,5083,3
+1051,alias_default_213,call_function,alias.default,forward,7,1,1,2,838,5081,4
+1052,alias_default_214,call_function,alias.default,forward,7,1,1,2,3,5082,3
+1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5
+1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10
+1055,dtype_cast_69,call_function,dtype_cast.default,forward,7,1,1,1,1,5068,2
+1056,alias_default_215,call_function,alias.default,forward,7,1,1,3,845,5078,4
+1057,convert_element_type_182,call_function,convert_element_type.default,forward,7,1,1,1,846,5076,4
+1058,alias_default_217,call_function,alias.default,forward,7,1,1,2,847,5075,4
+1059,pow_16,call_function,pow.Tensor_Scalar,forward,7,1,1,1,848,5074,4
+1060,mean_15,call_function,mean.dim,forward,7,1,1,1,849,5073,4
+1061,add_37,call_function,add.Scalar,forward,7,1,1,1,850,5072,3
+1062,rsqrt_15,call_function,rsqrt.default,forward,7,1,1,1,851,5071,3
+1063,alias_default_218,call_function,alias.default,forward,7,1,1,3,852,5070,3
+1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8
+1065,alias_default_216,call_function,alias.default,forward,7,1,1,2,2,5067,2
+1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8
+1067,convert_element_type_183,call_function,convert_element_type.default,forward,7,1,1,1,858,5064,6
+1068,dtype_cast_70,call_function,dtype_cast.default,forward,7,1,1,1,1,5064,3
+1069,permute_85,call_function,permute.default,forward,7,1,1,1,2,5063,3
+1070,alias_default_219,call_function,alias.default,forward,7,1,1,4,859,5063,4
+1071,alias_default_220,call_function,alias.default,forward,7,1,1,2,3,5062,3
+1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5
+1073,alias_default_221,call_function,alias.default,forward,7,1,1,2,865,5059,4
+1074,convert_element_type_186,call_function,convert_element_type.default,forward,7,1,1,1,866,5047,4
+1075,alias_default_222,call_function,alias.default,forward,7,1,1,2,867,5046,4
+1076,neg_7,call_function,neg.default,forward,7,1,1,1,868,5045,8
+1077,exp_7,call_function,exp.default,forward,7,1,1,1,869,5044,6
+1078,add_38,call_function,add.Tensor,forward,7,1,1,1,870,5043,4
+1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6
+1080,convert_element_type_187,call_function,convert_element_type.default,forward,7,1,1,1,872,5041,6
+1081,dtype_cast_71,call_function,dtype_cast.default,forward,7,1,1,1,1,5045,3
+1082,permute_86,call_function,permute.default,forward,7,1,1,1,2,5044,3
+1083,alias_default_224,call_function,alias.default,forward,7,1,1,2,3,5043,3
+1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5
+1085,alias_default_223,call_function,alias.default,forward,7,1,1,2,873,5040,4
+1086,alias_default_225,call_function,alias.default,forward,7,1,1,2,865,5040,4
+1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8
+1088,dtype_cast_72,call_function,dtype_cast.default,forward,7,1,1,1,1,5041,3
+1089,permute_87,call_function,permute.default,forward,7,1,1,1,2,5040,3
+1090,alias_default_226,call_function,alias.default,forward,7,1,1,2,881,5038,4
+1091,alias_default_227,call_function,alias.default,forward,7,1,1,2,3,5039,3
+1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5
+1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10
+1094,dtype_cast_73,call_function,dtype_cast.default,forward,8,1,1,1,1,5025,2
+1095,alias_default_228,call_function,alias.default,forward,7,1,1,3,888,5035,4
+1096,convert_element_type_192,call_function,convert_element_type.default,forward,8,1,1,1,889,5033,4
+1097,alias_default_230,call_function,alias.default,forward,8,1,1,2,890,5032,4
+1098,pow_17,call_function,pow.Tensor_Scalar,forward,8,1,1,1,891,5031,4
+1099,mean_16,call_function,mean.dim,forward,8,1,1,1,892,5030,4
+1100,add_40,call_function,add.Scalar,forward,8,1,1,1,893,5029,3
+1101,rsqrt_16,call_function,rsqrt.default,forward,8,1,1,1,894,5028,3
+1102,alias_default_231,call_function,alias.default,forward,8,1,1,3,895,5027,3
+1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8
+1104,alias_default_229,call_function,alias.default,forward,8,1,1,2,2,5024,2
+1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8
+1106,convert_element_type_193,call_function,convert_element_type.default,forward,8,1,1,1,901,5021,6
+1107,dtype_cast_74,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3
+1108,permute_88,call_function,permute.default,forward,8,1,1,1,2,5007,3
+1109,alias_default_232,call_function,alias.default,forward,8,1,1,6,902,5020,4
+1110,alias_default_233,call_function,alias.default,forward,8,1,1,2,3,5006,3
+1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5
+1112,dtype_cast_75,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3
+1113,permute_89,call_function,permute.default,forward,8,1,1,1,2,5007,3
+1114,alias_default_234,call_function,alias.default,forward,8,1,1,2,3,5006,3
+1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5
+1116,dtype_cast_76,call_function,dtype_cast.default,forward,8,1,1,1,1,5001,3
+1117,permute_90,call_function,permute.default,forward,8,1,1,1,2,5000,3
+1118,alias_default_235,call_function,alias.default,forward,8,1,1,2,3,4999,3
+1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5
+1120,view_190,call_function,view.default,forward,8,1,1,1,908,5003,4
+1121,view_191,call_function,view.default,forward,8,1,1,1,908,5003,4
+1122,view_192,call_function,view.default,forward,8,1,1,1,908,4996,4
+1123,convert_element_type_200,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4
+1124,view_193,call_function,view.default,forward,8,1,1,1,910,5001,4
+1125,view_as_complex_16,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6
+1126,convert_element_type_201,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4
+1127,view_194,call_function,view.default,forward,8,1,1,1,910,5001,4
+1128,view_as_complex_17,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6
+1129,view_195,call_function,view.default,forward,8,1,1,1,2,5011,3
+1130,alias_default_236,call_function,alias.default,forward,8,1,1,4,3,5010,3
+1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
+1132,view_as_real_16,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6
+1133,view_196,call_function,view.default,forward,8,1,1,1,916,4997,6
+1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
+1135,view_as_real_17,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6
+1136,view_197,call_function,view.default,forward,8,1,1,1,916,4997,6
+1137,convert_element_type_202,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6
+1138,convert_element_type_203,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6
+1139,permute_91,call_function,permute.default,forward,8,1,1,1,918,4995,6
+1140,permute_92,call_function,permute.default,forward,8,1,1,1,918,4995,6
+1141,permute_93,call_function,permute.default,forward,8,1,1,1,909,4995,4
+1142,alias_default_237,call_function,alias.default,forward,8,1,1,2,919,4994,4
+1143,alias_default_238,call_function,alias.default,forward,8,1,1,2,919,4994,4
+1144,alias_default_239,call_function,alias.default,forward,8,1,1,2,910,4994,4
+1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2
+1146,getitem_72,call_function,getitem,forward,8,1,1,1,944,4989,2
+1147,getitem_73,call_function,getitem,forward,8,1,1,1,944,944,2
+1148,getitem_78,call_function,getitem,forward,8,1,1,1,944,944,1
+1149,getitem_79,call_function,getitem,forward,8,1,1,1,944,944,1
+1150,alias_default_240,call_function,alias.default,forward,8,1,1,2,945,4988,4
+1151,permute_94,call_function,permute.default,forward,8,1,1,1,946,4987,4
+1152,view_198,call_function,view.default,forward,8,1,1,1,947,4986,3
+1153,dtype_cast_77,call_function,dtype_cast.default,forward,8,1,1,1,1,4988,3
+1154,permute_95,call_function,permute.default,forward,8,1,1,1,2,4987,3
+1155,alias_default_241,call_function,alias.default,forward,8,1,1,2,948,4985,4
+1156,alias_default_242,call_function,alias.default,forward,8,1,1,2,3,4986,3
+1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5
+1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10
+1159,dtype_cast_78,call_function,dtype_cast.default,forward,8,1,1,1,1,4972,2
+1160,alias_default_243,call_function,alias.default,forward,8,1,1,3,955,4982,4
+1161,convert_element_type_206,call_function,convert_element_type.default,forward,8,1,1,1,956,4980,4
+1162,alias_default_245,call_function,alias.default,forward,8,1,1,2,957,4979,4
+1163,pow_18,call_function,pow.Tensor_Scalar,forward,8,1,1,1,958,4978,4
+1164,mean_17,call_function,mean.dim,forward,8,1,1,1,959,4977,4
+1165,add_42,call_function,add.Scalar,forward,8,1,1,1,960,4976,3
+1166,rsqrt_17,call_function,rsqrt.default,forward,8,1,1,1,961,4975,3
+1167,alias_default_246,call_function,alias.default,forward,8,1,1,3,962,4974,3
+1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8
+1169,alias_default_244,call_function,alias.default,forward,8,1,1,2,2,4971,2
+1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8
+1171,convert_element_type_207,call_function,convert_element_type.default,forward,8,1,1,1,968,4968,6
+1172,dtype_cast_79,call_function,dtype_cast.default,forward,8,1,1,1,1,4968,3
+1173,permute_96,call_function,permute.default,forward,8,1,1,1,2,4967,3
+1174,alias_default_247,call_function,alias.default,forward,8,1,1,4,969,4967,4
+1175,alias_default_248,call_function,alias.default,forward,8,1,1,2,3,4966,3
+1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5
+1177,alias_default_249,call_function,alias.default,forward,8,1,1,2,975,4963,4
+1178,convert_element_type_210,call_function,convert_element_type.default,forward,8,1,1,1,976,4951,4
+1179,alias_default_250,call_function,alias.default,forward,8,1,1,2,977,4950,4
+1180,neg_8,call_function,neg.default,forward,8,1,1,1,978,4949,8
+1181,exp_8,call_function,exp.default,forward,8,1,1,1,979,4948,6
+1182,add_43,call_function,add.Tensor,forward,8,1,1,1,980,4947,4
+1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6
+1184,convert_element_type_211,call_function,convert_element_type.default,forward,8,1,1,1,982,4945,6
+1185,dtype_cast_80,call_function,dtype_cast.default,forward,8,1,1,1,1,4949,3
+1186,permute_97,call_function,permute.default,forward,8,1,1,1,2,4948,3
+1187,alias_default_252,call_function,alias.default,forward,8,1,1,2,3,4947,3
+1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5
+1189,alias_default_251,call_function,alias.default,forward,8,1,1,2,983,4944,4
+1190,alias_default_253,call_function,alias.default,forward,8,1,1,2,975,4944,4
+1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8
+1192,dtype_cast_81,call_function,dtype_cast.default,forward,8,1,1,1,1,4945,3
+1193,permute_98,call_function,permute.default,forward,8,1,1,1,2,4944,3
+1194,alias_default_254,call_function,alias.default,forward,8,1,1,2,991,4942,4
+1195,alias_default_255,call_function,alias.default,forward,8,1,1,2,3,4943,3
+1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5
+1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10
+1198,dtype_cast_82,call_function,dtype_cast.default,forward,9,1,1,1,1,4929,2
+1199,alias_default_256,call_function,alias.default,forward,8,1,1,3,998,4939,4
+1200,convert_element_type_216,call_function,convert_element_type.default,forward,9,1,1,1,999,4937,4
+1201,alias_default_258,call_function,alias.default,forward,9,1,1,2,1000,4936,4
+1202,pow_19,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1001,4935,4
+1203,mean_18,call_function,mean.dim,forward,9,1,1,1,1002,4934,4
+1204,add_45,call_function,add.Scalar,forward,9,1,1,1,1003,4933,3
+1205,rsqrt_18,call_function,rsqrt.default,forward,9,1,1,1,1004,4932,3
+1206,alias_default_259,call_function,alias.default,forward,9,1,1,3,1005,4931,3
+1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8
+1208,alias_default_257,call_function,alias.default,forward,9,1,1,2,2,4928,2
+1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8
+1210,convert_element_type_217,call_function,convert_element_type.default,forward,9,1,1,1,1011,4925,6
+1211,dtype_cast_83,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3
+1212,permute_99,call_function,permute.default,forward,9,1,1,1,2,4911,3
+1213,alias_default_260,call_function,alias.default,forward,9,1,1,6,1012,4924,4
+1214,alias_default_261,call_function,alias.default,forward,9,1,1,2,3,4910,3
+1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
+1216,dtype_cast_84,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3
+1217,permute_100,call_function,permute.default,forward,9,1,1,1,2,4911,3
+1218,alias_default_262,call_function,alias.default,forward,9,1,1,2,3,4910,3
+1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
+1220,dtype_cast_85,call_function,dtype_cast.default,forward,9,1,1,1,1,4905,3
+1221,permute_101,call_function,permute.default,forward,9,1,1,1,2,4904,3
+1222,alias_default_263,call_function,alias.default,forward,9,1,1,2,3,4903,3
+1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5
+1224,view_213,call_function,view.default,forward,9,1,1,1,1018,4907,4
+1225,view_214,call_function,view.default,forward,9,1,1,1,1018,4907,4
+1226,view_215,call_function,view.default,forward,9,1,1,1,1018,4900,4
+1227,convert_element_type_224,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4
+1228,view_216,call_function,view.default,forward,9,1,1,1,1020,4905,4
+1229,view_as_complex_18,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6
+1230,convert_element_type_225,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4
+1231,view_217,call_function,view.default,forward,9,1,1,1,1020,4905,4
+1232,view_as_complex_19,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6
+1233,view_218,call_function,view.default,forward,9,1,1,1,2,4915,3
+1234,alias_default_264,call_function,alias.default,forward,9,1,1,4,3,4914,3
+1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
+1236,view_as_real_18,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6
+1237,view_219,call_function,view.default,forward,9,1,1,1,1026,4901,6
+1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
+1239,view_as_real_19,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6
+1240,view_220,call_function,view.default,forward,9,1,1,1,1026,4901,6
+1241,convert_element_type_226,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6
+1242,convert_element_type_227,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6
+1243,permute_102,call_function,permute.default,forward,9,1,1,1,1028,4899,6
+1244,permute_103,call_function,permute.default,forward,9,1,1,1,1028,4899,6
+1245,permute_104,call_function,permute.default,forward,9,1,1,1,1019,4899,4
+1246,alias_default_265,call_function,alias.default,forward,9,1,1,2,1029,4898,4
+1247,alias_default_266,call_function,alias.default,forward,9,1,1,2,1029,4898,4
+1248,alias_default_267,call_function,alias.default,forward,9,1,1,2,1020,4898,4
+1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2
+1250,getitem_81,call_function,getitem,forward,9,1,1,1,1054,4893,2
+1251,getitem_82,call_function,getitem,forward,9,1,1,1,1054,1054,2
+1252,getitem_87,call_function,getitem,forward,9,1,1,1,1054,1054,1
+1253,getitem_88,call_function,getitem,forward,9,1,1,1,1054,1054,1
+1254,alias_default_268,call_function,alias.default,forward,9,1,1,2,1055,4892,4
+1255,permute_105,call_function,permute.default,forward,9,1,1,1,1056,4891,4
+1256,view_221,call_function,view.default,forward,9,1,1,1,1057,4890,3
+1257,dtype_cast_86,call_function,dtype_cast.default,forward,9,1,1,1,1,4892,3
+1258,permute_106,call_function,permute.default,forward,9,1,1,1,2,4891,3
+1259,alias_default_269,call_function,alias.default,forward,9,1,1,2,1058,4889,4
+1260,alias_default_270,call_function,alias.default,forward,9,1,1,2,3,4890,3
+1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5
+1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10
+1263,dtype_cast_87,call_function,dtype_cast.default,forward,9,1,1,1,1,4876,2
+1264,alias_default_271,call_function,alias.default,forward,9,1,1,3,1065,4886,4
+1265,convert_element_type_230,call_function,convert_element_type.default,forward,9,1,1,1,1066,4884,4
+1266,alias_default_273,call_function,alias.default,forward,9,1,1,2,1067,4883,4
+1267,pow_20,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1068,4882,4
+1268,mean_19,call_function,mean.dim,forward,9,1,1,1,1069,4881,4
+1269,add_47,call_function,add.Scalar,forward,9,1,1,1,1070,4880,3
+1270,rsqrt_19,call_function,rsqrt.default,forward,9,1,1,1,1071,4879,3
+1271,alias_default_274,call_function,alias.default,forward,9,1,1,3,1072,4878,3
+1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8
+1273,alias_default_272,call_function,alias.default,forward,9,1,1,2,2,4875,2
+1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8
+1275,convert_element_type_231,call_function,convert_element_type.default,forward,9,1,1,1,1078,4872,6
+1276,dtype_cast_88,call_function,dtype_cast.default,forward,9,1,1,1,1,4872,3
+1277,permute_107,call_function,permute.default,forward,9,1,1,1,2,4871,3
+1278,alias_default_275,call_function,alias.default,forward,9,1,1,4,1079,4871,4
+1279,alias_default_276,call_function,alias.default,forward,9,1,1,2,3,4870,3
+1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5
+1281,alias_default_277,call_function,alias.default,forward,9,1,1,2,1085,4867,4
+1282,convert_element_type_234,call_function,convert_element_type.default,forward,9,1,1,1,1086,4855,4
+1283,alias_default_278,call_function,alias.default,forward,9,1,1,2,1087,4854,4
+1284,neg_9,call_function,neg.default,forward,9,1,1,1,1088,4853,8
+1285,exp_9,call_function,exp.default,forward,9,1,1,1,1089,4852,6
+1286,add_48,call_function,add.Tensor,forward,9,1,1,1,1090,4851,4
+1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6
+1288,convert_element_type_235,call_function,convert_element_type.default,forward,9,1,1,1,1092,4849,6
+1289,dtype_cast_89,call_function,dtype_cast.default,forward,9,1,1,1,1,4853,3
+1290,permute_108,call_function,permute.default,forward,9,1,1,1,2,4852,3
+1291,alias_default_280,call_function,alias.default,forward,9,1,1,2,3,4851,3
+1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5
+1293,alias_default_279,call_function,alias.default,forward,9,1,1,2,1093,4848,4
+1294,alias_default_281,call_function,alias.default,forward,9,1,1,2,1085,4848,4
+1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8
+1296,dtype_cast_90,call_function,dtype_cast.default,forward,9,1,1,1,1,4849,3
+1297,permute_109,call_function,permute.default,forward,9,1,1,1,2,4848,3
+1298,alias_default_282,call_function,alias.default,forward,9,1,1,2,1101,4846,4
+1299,alias_default_283,call_function,alias.default,forward,9,1,1,2,3,4847,3
+1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5
+1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10
+1302,dtype_cast_91,call_function,dtype_cast.default,forward,10,1,1,1,1,4833,2
+1303,alias_default_284,call_function,alias.default,forward,9,1,1,3,1108,4843,4
+1304,convert_element_type_240,call_function,convert_element_type.default,forward,10,1,1,1,1109,4841,4
+1305,alias_default_286,call_function,alias.default,forward,10,1,1,2,1110,4840,4
+1306,pow_21,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1111,4839,4
+1307,mean_20,call_function,mean.dim,forward,10,1,1,1,1112,4838,4
+1308,add_50,call_function,add.Scalar,forward,10,1,1,1,1113,4837,3
+1309,rsqrt_20,call_function,rsqrt.default,forward,10,1,1,1,1114,4836,3
+1310,alias_default_287,call_function,alias.default,forward,10,1,1,3,1115,4835,3
+1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8
+1312,alias_default_285,call_function,alias.default,forward,10,1,1,2,2,4832,2
+1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8
+1314,convert_element_type_241,call_function,convert_element_type.default,forward,10,1,1,1,1121,4829,6
+1315,dtype_cast_92,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3
+1316,permute_110,call_function,permute.default,forward,10,1,1,1,2,4815,3
+1317,alias_default_288,call_function,alias.default,forward,10,1,1,6,1122,4828,4
+1318,alias_default_289,call_function,alias.default,forward,10,1,1,2,3,4814,3
+1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
+1320,dtype_cast_93,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3
+1321,permute_111,call_function,permute.default,forward,10,1,1,1,2,4815,3
+1322,alias_default_290,call_function,alias.default,forward,10,1,1,2,3,4814,3
+1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
+1324,dtype_cast_94,call_function,dtype_cast.default,forward,10,1,1,1,1,4809,3
+1325,permute_112,call_function,permute.default,forward,10,1,1,1,2,4808,3
+1326,alias_default_291,call_function,alias.default,forward,10,1,1,2,3,4807,3
+1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5
+1328,view_236,call_function,view.default,forward,10,1,1,1,1128,4811,4
+1329,view_237,call_function,view.default,forward,10,1,1,1,1128,4811,4
+1330,view_238,call_function,view.default,forward,10,1,1,1,1128,4804,4
+1331,convert_element_type_248,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4
+1332,view_239,call_function,view.default,forward,10,1,1,1,1130,4809,4
+1333,view_as_complex_20,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6
+1334,convert_element_type_249,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4
+1335,view_240,call_function,view.default,forward,10,1,1,1,1130,4809,4
+1336,view_as_complex_21,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6
+1337,view_241,call_function,view.default,forward,10,1,1,1,2,4819,3
+1338,alias_default_292,call_function,alias.default,forward,10,1,1,4,3,4818,3
+1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
+1340,view_as_real_20,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6
+1341,view_242,call_function,view.default,forward,10,1,1,1,1136,4805,6
+1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
+1343,view_as_real_21,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6
+1344,view_243,call_function,view.default,forward,10,1,1,1,1136,4805,6
+1345,convert_element_type_250,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6
+1346,convert_element_type_251,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6
+1347,permute_113,call_function,permute.default,forward,10,1,1,1,1138,4803,6
+1348,permute_114,call_function,permute.default,forward,10,1,1,1,1138,4803,6
+1349,permute_115,call_function,permute.default,forward,10,1,1,1,1129,4803,4
+1350,alias_default_293,call_function,alias.default,forward,10,1,1,2,1139,4802,4
+1351,alias_default_294,call_function,alias.default,forward,10,1,1,2,1139,4802,4
+1352,alias_default_295,call_function,alias.default,forward,10,1,1,2,1130,4802,4
+1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2
+1354,getitem_90,call_function,getitem,forward,10,1,1,1,1164,4797,2
+1355,getitem_91,call_function,getitem,forward,10,1,1,1,1164,1164,2
+1356,getitem_96,call_function,getitem,forward,10,1,1,1,1164,1164,1
+1357,getitem_97,call_function,getitem,forward,10,1,1,1,1164,1164,1
+1358,alias_default_296,call_function,alias.default,forward,10,1,1,2,1165,4796,4
+1359,permute_116,call_function,permute.default,forward,10,1,1,1,1166,4795,4
+1360,view_244,call_function,view.default,forward,10,1,1,1,1167,4794,3
+1361,dtype_cast_95,call_function,dtype_cast.default,forward,10,1,1,1,1,4796,3
+1362,permute_117,call_function,permute.default,forward,10,1,1,1,2,4795,3
+1363,alias_default_297,call_function,alias.default,forward,10,1,1,2,1168,4793,4
+1364,alias_default_298,call_function,alias.default,forward,10,1,1,2,3,4794,3
+1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5
+1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10
+1367,dtype_cast_96,call_function,dtype_cast.default,forward,10,1,1,1,1,4780,2
+1368,alias_default_299,call_function,alias.default,forward,10,1,1,3,1175,4790,4
+1369,convert_element_type_254,call_function,convert_element_type.default,forward,10,1,1,1,1176,4788,4
+1370,alias_default_301,call_function,alias.default,forward,10,1,1,2,1177,4787,4
+1371,pow_22,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1178,4786,4
+1372,mean_21,call_function,mean.dim,forward,10,1,1,1,1179,4785,4
+1373,add_52,call_function,add.Scalar,forward,10,1,1,1,1180,4784,3
+1374,rsqrt_21,call_function,rsqrt.default,forward,10,1,1,1,1181,4783,3
+1375,alias_default_302,call_function,alias.default,forward,10,1,1,3,1182,4782,3
+1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8
+1377,alias_default_300,call_function,alias.default,forward,10,1,1,2,2,4779,2
+1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8
+1379,convert_element_type_255,call_function,convert_element_type.default,forward,10,1,1,1,1188,4776,6
+1380,dtype_cast_97,call_function,dtype_cast.default,forward,10,1,1,1,1,4776,3
+1381,permute_118,call_function,permute.default,forward,10,1,1,1,2,4775,3
+1382,alias_default_303,call_function,alias.default,forward,10,1,1,4,1189,4775,4
+1383,alias_default_304,call_function,alias.default,forward,10,1,1,2,3,4774,3
+1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5
+1385,alias_default_305,call_function,alias.default,forward,10,1,1,2,1195,4771,4
+1386,convert_element_type_258,call_function,convert_element_type.default,forward,10,1,1,1,1196,4759,4
+1387,alias_default_306,call_function,alias.default,forward,10,1,1,2,1197,4758,4
+1388,neg_10,call_function,neg.default,forward,10,1,1,1,1198,4757,8
+1389,exp_10,call_function,exp.default,forward,10,1,1,1,1199,4756,6
+1390,add_53,call_function,add.Tensor,forward,10,1,1,1,1200,4755,4
+1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6
+1392,convert_element_type_259,call_function,convert_element_type.default,forward,10,1,1,1,1202,4753,6
+1393,dtype_cast_98,call_function,dtype_cast.default,forward,10,1,1,1,1,4757,3
+1394,permute_119,call_function,permute.default,forward,10,1,1,1,2,4756,3
+1395,alias_default_308,call_function,alias.default,forward,10,1,1,2,3,4755,3
+1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5
+1397,alias_default_307,call_function,alias.default,forward,10,1,1,2,1203,4752,4
+1398,alias_default_309,call_function,alias.default,forward,10,1,1,2,1195,4752,4
+1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8
+1400,dtype_cast_99,call_function,dtype_cast.default,forward,10,1,1,1,1,4753,3
+1401,permute_120,call_function,permute.default,forward,10,1,1,1,2,4752,3
+1402,alias_default_310,call_function,alias.default,forward,10,1,1,2,1211,4750,4
+1403,alias_default_311,call_function,alias.default,forward,10,1,1,2,3,4751,3
+1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5
+1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10
+1406,dtype_cast_100,call_function,dtype_cast.default,forward,11,1,1,1,1,4737,2
+1407,alias_default_312,call_function,alias.default,forward,10,1,1,3,1218,4747,4
+1408,convert_element_type_264,call_function,convert_element_type.default,forward,11,1,1,1,1219,4745,4
+1409,alias_default_314,call_function,alias.default,forward,11,1,1,2,1220,4744,4
+1410,pow_23,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1221,4743,4
+1411,mean_22,call_function,mean.dim,forward,11,1,1,1,1222,4742,4
+1412,add_55,call_function,add.Scalar,forward,11,1,1,1,1223,4741,3
+1413,rsqrt_22,call_function,rsqrt.default,forward,11,1,1,1,1224,4740,3
+1414,alias_default_315,call_function,alias.default,forward,11,1,1,3,1225,4739,3
+1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8
+1416,alias_default_313,call_function,alias.default,forward,11,1,1,2,2,4736,2
+1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8
+1418,convert_element_type_265,call_function,convert_element_type.default,forward,11,1,1,1,1231,4733,6
+1419,dtype_cast_101,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3
+1420,permute_121,call_function,permute.default,forward,11,1,1,1,2,4719,3
+1421,alias_default_316,call_function,alias.default,forward,11,1,1,6,1232,4732,4
+1422,alias_default_317,call_function,alias.default,forward,11,1,1,2,3,4718,3
+1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
+1424,dtype_cast_102,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3
+1425,permute_122,call_function,permute.default,forward,11,1,1,1,2,4719,3
+1426,alias_default_318,call_function,alias.default,forward,11,1,1,2,3,4718,3
+1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
+1428,dtype_cast_103,call_function,dtype_cast.default,forward,11,1,1,1,1,4713,3
+1429,permute_123,call_function,permute.default,forward,11,1,1,1,2,4712,3
+1430,alias_default_319,call_function,alias.default,forward,11,1,1,2,3,4711,3
+1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5
+1432,view_259,call_function,view.default,forward,11,1,1,1,1238,4715,4
+1433,view_260,call_function,view.default,forward,11,1,1,1,1238,4715,4
+1434,view_261,call_function,view.default,forward,11,1,1,1,1238,4708,4
+1435,convert_element_type_272,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4
+1436,view_262,call_function,view.default,forward,11,1,1,1,1240,4713,4
+1437,view_as_complex_22,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6
+1438,convert_element_type_273,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4
+1439,view_263,call_function,view.default,forward,11,1,1,1,1240,4713,4
+1440,view_as_complex_23,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6
+1441,view_264,call_function,view.default,forward,11,1,1,1,2,4723,3
+1442,alias_default_320,call_function,alias.default,forward,11,1,1,4,3,4722,3
+1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
+1444,view_as_real_22,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6
+1445,view_265,call_function,view.default,forward,11,1,1,1,1246,4709,6
+1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
+1447,view_as_real_23,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6
+1448,view_266,call_function,view.default,forward,11,1,1,1,1246,4709,6
+1449,convert_element_type_274,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6
+1450,convert_element_type_275,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6
+1451,permute_124,call_function,permute.default,forward,11,1,1,1,1248,4707,6
+1452,permute_125,call_function,permute.default,forward,11,1,1,1,1248,4707,6
+1453,permute_126,call_function,permute.default,forward,11,1,1,1,1239,4707,4
+1454,alias_default_321,call_function,alias.default,forward,11,1,1,2,1249,4706,4
+1455,alias_default_322,call_function,alias.default,forward,11,1,1,2,1249,4706,4
+1456,alias_default_323,call_function,alias.default,forward,11,1,1,2,1240,4706,4
+1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2
+1458,getitem_99,call_function,getitem,forward,11,1,1,1,1274,4701,2
+1459,getitem_100,call_function,getitem,forward,11,1,1,1,1274,1274,2
+1460,getitem_105,call_function,getitem,forward,11,1,1,1,1274,1274,1
+1461,getitem_106,call_function,getitem,forward,11,1,1,1,1274,1274,1
+1462,alias_default_324,call_function,alias.default,forward,11,1,1,2,1275,4700,4
+1463,permute_127,call_function,permute.default,forward,11,1,1,1,1276,4699,4
+1464,view_267,call_function,view.default,forward,11,1,1,1,1277,4698,3
+1465,dtype_cast_104,call_function,dtype_cast.default,forward,11,1,1,1,1,4700,3
+1466,permute_128,call_function,permute.default,forward,11,1,1,1,2,4699,3
+1467,alias_default_325,call_function,alias.default,forward,11,1,1,2,1278,4697,4
+1468,alias_default_326,call_function,alias.default,forward,11,1,1,2,3,4698,3
+1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5
+1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10
+1471,dtype_cast_105,call_function,dtype_cast.default,forward,11,1,1,1,1,4684,2
+1472,alias_default_327,call_function,alias.default,forward,11,1,1,3,1285,4694,4
+1473,convert_element_type_278,call_function,convert_element_type.default,forward,11,1,1,1,1286,4692,4
+1474,alias_default_329,call_function,alias.default,forward,11,1,1,2,1287,4691,4
+1475,pow_24,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1288,4690,4
+1476,mean_23,call_function,mean.dim,forward,11,1,1,1,1289,4689,4
+1477,add_57,call_function,add.Scalar,forward,11,1,1,1,1290,4688,3
+1478,rsqrt_23,call_function,rsqrt.default,forward,11,1,1,1,1291,4687,3
+1479,alias_default_330,call_function,alias.default,forward,11,1,1,3,1292,4686,3
+1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8
+1481,alias_default_328,call_function,alias.default,forward,11,1,1,2,2,4683,2
+1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8
+1483,convert_element_type_279,call_function,convert_element_type.default,forward,11,1,1,1,1298,4680,6
+1484,dtype_cast_106,call_function,dtype_cast.default,forward,11,1,1,1,1,4680,3
+1485,permute_129,call_function,permute.default,forward,11,1,1,1,2,4679,3
+1486,alias_default_331,call_function,alias.default,forward,11,1,1,4,1299,4679,4
+1487,alias_default_332,call_function,alias.default,forward,11,1,1,2,3,4678,3
+1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5
+1489,alias_default_333,call_function,alias.default,forward,11,1,1,2,1305,4675,4
+1490,convert_element_type_282,call_function,convert_element_type.default,forward,11,1,1,1,1306,4663,4
+1491,alias_default_334,call_function,alias.default,forward,11,1,1,2,1307,4662,4
+1492,neg_11,call_function,neg.default,forward,11,1,1,1,1308,4661,8
+1493,exp_11,call_function,exp.default,forward,11,1,1,1,1309,4660,6
+1494,add_58,call_function,add.Tensor,forward,11,1,1,1,1310,4659,4
+1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6
+1496,convert_element_type_283,call_function,convert_element_type.default,forward,11,1,1,1,1312,4657,6
+1497,dtype_cast_107,call_function,dtype_cast.default,forward,11,1,1,1,1,4661,3
+1498,permute_130,call_function,permute.default,forward,11,1,1,1,2,4660,3
+1499,alias_default_336,call_function,alias.default,forward,11,1,1,2,3,4659,3
+1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5
+1501,alias_default_335,call_function,alias.default,forward,11,1,1,2,1313,4656,4
+1502,alias_default_337,call_function,alias.default,forward,11,1,1,2,1305,4656,4
+1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8
+1504,dtype_cast_108,call_function,dtype_cast.default,forward,11,1,1,1,1,4657,3
+1505,permute_131,call_function,permute.default,forward,11,1,1,1,2,4656,3
+1506,alias_default_338,call_function,alias.default,forward,11,1,1,2,1321,4654,4
+1507,alias_default_339,call_function,alias.default,forward,11,1,1,2,3,4655,3
+1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5
+1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10
+1510,dtype_cast_109,call_function,dtype_cast.default,forward,12,1,1,1,1,4641,2
+1511,alias_default_340,call_function,alias.default,forward,11,1,1,3,1328,4651,4
+1512,convert_element_type_288,call_function,convert_element_type.default,forward,12,1,1,1,1329,4649,4
+1513,alias_default_342,call_function,alias.default,forward,12,1,1,2,1330,4648,4
+1514,pow_25,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1331,4647,4
+1515,mean_24,call_function,mean.dim,forward,12,1,1,1,1332,4646,4
+1516,add_60,call_function,add.Scalar,forward,12,1,1,1,1333,4645,3
+1517,rsqrt_24,call_function,rsqrt.default,forward,12,1,1,1,1334,4644,3
+1518,alias_default_343,call_function,alias.default,forward,12,1,1,3,1335,4643,3
+1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8
+1520,alias_default_341,call_function,alias.default,forward,12,1,1,2,2,4640,2
+1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8
+1522,convert_element_type_289,call_function,convert_element_type.default,forward,12,1,1,1,1341,4637,6
+1523,dtype_cast_110,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3
+1524,permute_132,call_function,permute.default,forward,12,1,1,1,2,4623,3
+1525,alias_default_344,call_function,alias.default,forward,12,1,1,6,1342,4636,4
+1526,alias_default_345,call_function,alias.default,forward,12,1,1,2,3,4622,3
+1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
+1528,dtype_cast_111,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3
+1529,permute_133,call_function,permute.default,forward,12,1,1,1,2,4623,3
+1530,alias_default_346,call_function,alias.default,forward,12,1,1,2,3,4622,3
+1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
+1532,dtype_cast_112,call_function,dtype_cast.default,forward,12,1,1,1,1,4617,3
+1533,permute_134,call_function,permute.default,forward,12,1,1,1,2,4616,3
+1534,alias_default_347,call_function,alias.default,forward,12,1,1,2,3,4615,3
+1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5
+1536,view_282,call_function,view.default,forward,12,1,1,1,1348,4619,4
+1537,view_283,call_function,view.default,forward,12,1,1,1,1348,4619,4
+1538,view_284,call_function,view.default,forward,12,1,1,1,1348,4612,4
+1539,convert_element_type_296,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4
+1540,view_285,call_function,view.default,forward,12,1,1,1,1350,4617,4
+1541,view_as_complex_24,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6
+1542,convert_element_type_297,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4
+1543,view_286,call_function,view.default,forward,12,1,1,1,1350,4617,4
+1544,view_as_complex_25,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6
+1545,view_287,call_function,view.default,forward,12,1,1,1,2,4627,3
+1546,alias_default_348,call_function,alias.default,forward,12,1,1,4,3,4626,3
+1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
+1548,view_as_real_24,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6
+1549,view_288,call_function,view.default,forward,12,1,1,1,1356,4613,6
+1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
+1551,view_as_real_25,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6
+1552,view_289,call_function,view.default,forward,12,1,1,1,1356,4613,6
+1553,convert_element_type_298,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6
+1554,convert_element_type_299,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6
+1555,permute_135,call_function,permute.default,forward,12,1,1,1,1358,4611,6
+1556,permute_136,call_function,permute.default,forward,12,1,1,1,1358,4611,6
+1557,permute_137,call_function,permute.default,forward,12,1,1,1,1349,4611,4
+1558,alias_default_349,call_function,alias.default,forward,12,1,1,2,1359,4610,4
+1559,alias_default_350,call_function,alias.default,forward,12,1,1,2,1359,4610,4
+1560,alias_default_351,call_function,alias.default,forward,12,1,1,2,1350,4610,4
+1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2
+1562,getitem_108,call_function,getitem,forward,12,1,1,1,1384,4605,2
+1563,getitem_109,call_function,getitem,forward,12,1,1,1,1384,1384,2
+1564,getitem_114,call_function,getitem,forward,12,1,1,1,1384,1384,1
+1565,getitem_115,call_function,getitem,forward,12,1,1,1,1384,1384,1
+1566,alias_default_352,call_function,alias.default,forward,12,1,1,2,1385,4604,4
+1567,permute_138,call_function,permute.default,forward,12,1,1,1,1386,4603,4
+1568,view_290,call_function,view.default,forward,12,1,1,1,1387,4602,3
+1569,dtype_cast_113,call_function,dtype_cast.default,forward,12,1,1,1,1,4604,3
+1570,permute_139,call_function,permute.default,forward,12,1,1,1,2,4603,3
+1571,alias_default_353,call_function,alias.default,forward,12,1,1,2,1388,4601,4
+1572,alias_default_354,call_function,alias.default,forward,12,1,1,2,3,4602,3
+1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5
+1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10
+1575,dtype_cast_114,call_function,dtype_cast.default,forward,12,1,1,1,1,4588,2
+1576,alias_default_355,call_function,alias.default,forward,12,1,1,3,1395,4598,4
+1577,convert_element_type_302,call_function,convert_element_type.default,forward,12,1,1,1,1396,4596,4
+1578,alias_default_357,call_function,alias.default,forward,12,1,1,2,1397,4595,4
+1579,pow_26,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1398,4594,4
+1580,mean_25,call_function,mean.dim,forward,12,1,1,1,1399,4593,4
+1581,add_62,call_function,add.Scalar,forward,12,1,1,1,1400,4592,3
+1582,rsqrt_25,call_function,rsqrt.default,forward,12,1,1,1,1401,4591,3
+1583,alias_default_358,call_function,alias.default,forward,12,1,1,3,1402,4590,3
+1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8
+1585,alias_default_356,call_function,alias.default,forward,12,1,1,2,2,4587,2
+1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8
+1587,convert_element_type_303,call_function,convert_element_type.default,forward,12,1,1,1,1408,4584,6
+1588,dtype_cast_115,call_function,dtype_cast.default,forward,12,1,1,1,1,4584,3
+1589,permute_140,call_function,permute.default,forward,12,1,1,1,2,4583,3
+1590,alias_default_359,call_function,alias.default,forward,12,1,1,4,1409,4583,4
+1591,alias_default_360,call_function,alias.default,forward,12,1,1,2,3,4582,3
+1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5
+1593,alias_default_361,call_function,alias.default,forward,12,1,1,2,1415,4579,4
+1594,convert_element_type_306,call_function,convert_element_type.default,forward,12,1,1,1,1416,4567,4
+1595,alias_default_362,call_function,alias.default,forward,12,1,1,2,1417,4566,4
+1596,neg_12,call_function,neg.default,forward,12,1,1,1,1418,4565,8
+1597,exp_12,call_function,exp.default,forward,12,1,1,1,1419,4564,6
+1598,add_63,call_function,add.Tensor,forward,12,1,1,1,1420,4563,4
+1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6
+1600,convert_element_type_307,call_function,convert_element_type.default,forward,12,1,1,1,1422,4561,6
+1601,dtype_cast_116,call_function,dtype_cast.default,forward,12,1,1,1,1,4565,3
+1602,permute_141,call_function,permute.default,forward,12,1,1,1,2,4564,3
+1603,alias_default_364,call_function,alias.default,forward,12,1,1,2,3,4563,3
+1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5
+1605,alias_default_363,call_function,alias.default,forward,12,1,1,2,1423,4560,4
+1606,alias_default_365,call_function,alias.default,forward,12,1,1,2,1415,4560,4
+1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8
+1608,dtype_cast_117,call_function,dtype_cast.default,forward,12,1,1,1,1,4561,3
+1609,permute_142,call_function,permute.default,forward,12,1,1,1,2,4560,3
+1610,alias_default_366,call_function,alias.default,forward,12,1,1,2,1431,4558,4
+1611,alias_default_367,call_function,alias.default,forward,12,1,1,2,3,4559,3
+1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5
+1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10
+1614,dtype_cast_118,call_function,dtype_cast.default,forward,13,1,1,1,1,4545,2
+1615,alias_default_368,call_function,alias.default,forward,12,1,1,3,1438,4555,4
+1616,convert_element_type_312,call_function,convert_element_type.default,forward,13,1,1,1,1439,4553,4
+1617,alias_default_370,call_function,alias.default,forward,13,1,1,2,1440,4552,4
+1618,pow_27,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1441,4551,4
+1619,mean_26,call_function,mean.dim,forward,13,1,1,1,1442,4550,4
+1620,add_65,call_function,add.Scalar,forward,13,1,1,1,1443,4549,3
+1621,rsqrt_26,call_function,rsqrt.default,forward,13,1,1,1,1444,4548,3
+1622,alias_default_371,call_function,alias.default,forward,13,1,1,3,1445,4547,3
+1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8
+1624,alias_default_369,call_function,alias.default,forward,13,1,1,2,2,4544,2
+1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8
+1626,convert_element_type_313,call_function,convert_element_type.default,forward,13,1,1,1,1451,4541,6
+1627,dtype_cast_119,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3
+1628,permute_143,call_function,permute.default,forward,13,1,1,1,2,4527,3
+1629,alias_default_372,call_function,alias.default,forward,13,1,1,6,1452,4540,4
+1630,alias_default_373,call_function,alias.default,forward,13,1,1,2,3,4526,3
+1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
+1632,dtype_cast_120,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3
+1633,permute_144,call_function,permute.default,forward,13,1,1,1,2,4527,3
+1634,alias_default_374,call_function,alias.default,forward,13,1,1,2,3,4526,3
+1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
+1636,dtype_cast_121,call_function,dtype_cast.default,forward,13,1,1,1,1,4521,3
+1637,permute_145,call_function,permute.default,forward,13,1,1,1,2,4520,3
+1638,alias_default_375,call_function,alias.default,forward,13,1,1,2,3,4519,3
+1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5
+1640,view_305,call_function,view.default,forward,13,1,1,1,1458,4523,4
+1641,view_306,call_function,view.default,forward,13,1,1,1,1458,4523,4
+1642,view_307,call_function,view.default,forward,13,1,1,1,1458,4516,4
+1643,convert_element_type_320,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4
+1644,view_308,call_function,view.default,forward,13,1,1,1,1460,4521,4
+1645,view_as_complex_26,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6
+1646,convert_element_type_321,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4
+1647,view_309,call_function,view.default,forward,13,1,1,1,1460,4521,4
+1648,view_as_complex_27,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6
+1649,view_310,call_function,view.default,forward,13,1,1,1,2,4531,3
+1650,alias_default_376,call_function,alias.default,forward,13,1,1,4,3,4530,3
+1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
+1652,view_as_real_26,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6
+1653,view_311,call_function,view.default,forward,13,1,1,1,1466,4517,6
+1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
+1655,view_as_real_27,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6
+1656,view_312,call_function,view.default,forward,13,1,1,1,1466,4517,6
+1657,convert_element_type_322,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6
+1658,convert_element_type_323,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6
+1659,permute_146,call_function,permute.default,forward,13,1,1,1,1468,4515,6
+1660,permute_147,call_function,permute.default,forward,13,1,1,1,1468,4515,6
+1661,permute_148,call_function,permute.default,forward,13,1,1,1,1459,4515,4
+1662,alias_default_377,call_function,alias.default,forward,13,1,1,2,1469,4514,4
+1663,alias_default_378,call_function,alias.default,forward,13,1,1,2,1469,4514,4
+1664,alias_default_379,call_function,alias.default,forward,13,1,1,2,1460,4514,4
+1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2
+1666,getitem_117,call_function,getitem,forward,13,1,1,1,1494,4509,2
+1667,getitem_118,call_function,getitem,forward,13,1,1,1,1494,1494,2
+1668,getitem_123,call_function,getitem,forward,13,1,1,1,1494,1494,1
+1669,getitem_124,call_function,getitem,forward,13,1,1,1,1494,1494,1
+1670,alias_default_380,call_function,alias.default,forward,13,1,1,2,1495,4508,4
+1671,permute_149,call_function,permute.default,forward,13,1,1,1,1496,4507,4
+1672,view_313,call_function,view.default,forward,13,1,1,1,1497,4506,3
+1673,dtype_cast_122,call_function,dtype_cast.default,forward,13,1,1,1,1,4508,3
+1674,permute_150,call_function,permute.default,forward,13,1,1,1,2,4507,3
+1675,alias_default_381,call_function,alias.default,forward,13,1,1,2,1498,4505,4
+1676,alias_default_382,call_function,alias.default,forward,13,1,1,2,3,4506,3
+1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5
+1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10
+1679,dtype_cast_123,call_function,dtype_cast.default,forward,13,1,1,1,1,4492,2
+1680,alias_default_383,call_function,alias.default,forward,13,1,1,3,1505,4502,4
+1681,convert_element_type_326,call_function,convert_element_type.default,forward,13,1,1,1,1506,4500,4
+1682,alias_default_385,call_function,alias.default,forward,13,1,1,2,1507,4499,4
+1683,pow_28,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1508,4498,4
+1684,mean_27,call_function,mean.dim,forward,13,1,1,1,1509,4497,4
+1685,add_67,call_function,add.Scalar,forward,13,1,1,1,1510,4496,3
+1686,rsqrt_27,call_function,rsqrt.default,forward,13,1,1,1,1511,4495,3
+1687,alias_default_386,call_function,alias.default,forward,13,1,1,3,1512,4494,3
+1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8
+1689,alias_default_384,call_function,alias.default,forward,13,1,1,2,2,4491,2
+1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8
+1691,convert_element_type_327,call_function,convert_element_type.default,forward,13,1,1,1,1518,4488,6
+1692,dtype_cast_124,call_function,dtype_cast.default,forward,13,1,1,1,1,4488,3
+1693,permute_151,call_function,permute.default,forward,13,1,1,1,2,4487,3
+1694,alias_default_387,call_function,alias.default,forward,13,1,1,4,1519,4487,4
+1695,alias_default_388,call_function,alias.default,forward,13,1,1,2,3,4486,3
+1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5
+1697,alias_default_389,call_function,alias.default,forward,13,1,1,2,1525,4483,4
+1698,convert_element_type_330,call_function,convert_element_type.default,forward,13,1,1,1,1526,4471,4
+1699,alias_default_390,call_function,alias.default,forward,13,1,1,2,1527,4470,4
+1700,neg_13,call_function,neg.default,forward,13,1,1,1,1528,4469,8
+1701,exp_13,call_function,exp.default,forward,13,1,1,1,1529,4468,6
+1702,add_68,call_function,add.Tensor,forward,13,1,1,1,1530,4467,4
+1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6
+1704,convert_element_type_331,call_function,convert_element_type.default,forward,13,1,1,1,1532,4465,6
+1705,dtype_cast_125,call_function,dtype_cast.default,forward,13,1,1,1,1,4469,3
+1706,permute_152,call_function,permute.default,forward,13,1,1,1,2,4468,3
+1707,alias_default_392,call_function,alias.default,forward,13,1,1,2,3,4467,3
+1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5
+1709,alias_default_391,call_function,alias.default,forward,13,1,1,2,1533,4464,4
+1710,alias_default_393,call_function,alias.default,forward,13,1,1,2,1525,4464,4
+1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8
+1712,dtype_cast_126,call_function,dtype_cast.default,forward,13,1,1,1,1,4465,3
+1713,permute_153,call_function,permute.default,forward,13,1,1,1,2,4464,3
+1714,alias_default_394,call_function,alias.default,forward,13,1,1,2,1541,4462,4
+1715,alias_default_395,call_function,alias.default,forward,13,1,1,2,3,4463,3
+1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5
+1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10
+1718,dtype_cast_127,call_function,dtype_cast.default,forward,14,1,1,1,1,4449,2
+1719,alias_default_396,call_function,alias.default,forward,13,1,1,3,1548,4459,4
+1720,convert_element_type_336,call_function,convert_element_type.default,forward,14,1,1,1,1549,4457,4
+1721,alias_default_398,call_function,alias.default,forward,14,1,1,2,1550,4456,4
+1722,pow_29,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1551,4455,4
+1723,mean_28,call_function,mean.dim,forward,14,1,1,1,1552,4454,4
+1724,add_70,call_function,add.Scalar,forward,14,1,1,1,1553,4453,3
+1725,rsqrt_28,call_function,rsqrt.default,forward,14,1,1,1,1554,4452,3
+1726,alias_default_399,call_function,alias.default,forward,14,1,1,3,1555,4451,3
+1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8
+1728,alias_default_397,call_function,alias.default,forward,14,1,1,2,2,4448,2
+1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8
+1730,convert_element_type_337,call_function,convert_element_type.default,forward,14,1,1,1,1561,4445,6
+1731,dtype_cast_128,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3
+1732,permute_154,call_function,permute.default,forward,14,1,1,1,2,4431,3
+1733,alias_default_400,call_function,alias.default,forward,14,1,1,6,1562,4444,4
+1734,alias_default_401,call_function,alias.default,forward,14,1,1,2,3,4430,3
+1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
+1736,dtype_cast_129,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3
+1737,permute_155,call_function,permute.default,forward,14,1,1,1,2,4431,3
+1738,alias_default_402,call_function,alias.default,forward,14,1,1,2,3,4430,3
+1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
+1740,dtype_cast_130,call_function,dtype_cast.default,forward,14,1,1,1,1,4425,3
+1741,permute_156,call_function,permute.default,forward,14,1,1,1,2,4424,3
+1742,alias_default_403,call_function,alias.default,forward,14,1,1,2,3,4423,3
+1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5
+1744,view_328,call_function,view.default,forward,14,1,1,1,1568,4427,4
+1745,view_329,call_function,view.default,forward,14,1,1,1,1568,4427,4
+1746,view_330,call_function,view.default,forward,14,1,1,1,1568,4420,4
+1747,convert_element_type_344,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4
+1748,view_331,call_function,view.default,forward,14,1,1,1,1570,4425,4
+1749,view_as_complex_28,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6
+1750,convert_element_type_345,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4
+1751,view_332,call_function,view.default,forward,14,1,1,1,1570,4425,4
+1752,view_as_complex_29,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6
+1753,view_333,call_function,view.default,forward,14,1,1,1,2,4435,3
+1754,alias_default_404,call_function,alias.default,forward,14,1,1,4,3,4434,3
+1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
+1756,view_as_real_28,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6
+1757,view_334,call_function,view.default,forward,14,1,1,1,1576,4421,6
+1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
+1759,view_as_real_29,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6
+1760,view_335,call_function,view.default,forward,14,1,1,1,1576,4421,6
+1761,convert_element_type_346,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6
+1762,convert_element_type_347,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6
+1763,permute_157,call_function,permute.default,forward,14,1,1,1,1578,4419,6
+1764,permute_158,call_function,permute.default,forward,14,1,1,1,1578,4419,6
+1765,permute_159,call_function,permute.default,forward,14,1,1,1,1569,4419,4
+1766,alias_default_405,call_function,alias.default,forward,14,1,1,2,1579,4418,4
+1767,alias_default_406,call_function,alias.default,forward,14,1,1,2,1579,4418,4
+1768,alias_default_407,call_function,alias.default,forward,14,1,1,2,1570,4418,4
+1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2
+1770,getitem_126,call_function,getitem,forward,14,1,1,1,1604,4413,2
+1771,getitem_127,call_function,getitem,forward,14,1,1,1,1604,1604,2
+1772,getitem_132,call_function,getitem,forward,14,1,1,1,1604,1604,1
+1773,getitem_133,call_function,getitem,forward,14,1,1,1,1604,1604,1
+1774,alias_default_408,call_function,alias.default,forward,14,1,1,2,1605,4412,4
+1775,permute_160,call_function,permute.default,forward,14,1,1,1,1606,4411,4
+1776,view_336,call_function,view.default,forward,14,1,1,1,1607,4410,3
+1777,dtype_cast_131,call_function,dtype_cast.default,forward,14,1,1,1,1,4412,3
+1778,permute_161,call_function,permute.default,forward,14,1,1,1,2,4411,3
+1779,alias_default_409,call_function,alias.default,forward,14,1,1,2,1608,4409,4
+1780,alias_default_410,call_function,alias.default,forward,14,1,1,2,3,4410,3
+1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5
+1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10
+1783,dtype_cast_132,call_function,dtype_cast.default,forward,14,1,1,1,1,4396,2
+1784,alias_default_411,call_function,alias.default,forward,14,1,1,3,1615,4406,4
+1785,convert_element_type_350,call_function,convert_element_type.default,forward,14,1,1,1,1616,4404,4
+1786,alias_default_413,call_function,alias.default,forward,14,1,1,2,1617,4403,4
+1787,pow_30,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1618,4402,4
+1788,mean_29,call_function,mean.dim,forward,14,1,1,1,1619,4401,4
+1789,add_72,call_function,add.Scalar,forward,14,1,1,1,1620,4400,3
+1790,rsqrt_29,call_function,rsqrt.default,forward,14,1,1,1,1621,4399,3
+1791,alias_default_414,call_function,alias.default,forward,14,1,1,3,1622,4398,3
+1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8
+1793,alias_default_412,call_function,alias.default,forward,14,1,1,2,2,4395,2
+1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8
+1795,convert_element_type_351,call_function,convert_element_type.default,forward,14,1,1,1,1628,4392,6
+1796,dtype_cast_133,call_function,dtype_cast.default,forward,14,1,1,1,1,4392,3
+1797,permute_162,call_function,permute.default,forward,14,1,1,1,2,4391,3
+1798,alias_default_415,call_function,alias.default,forward,14,1,1,4,1629,4391,4
+1799,alias_default_416,call_function,alias.default,forward,14,1,1,2,3,4390,3
+1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5
+1801,alias_default_417,call_function,alias.default,forward,14,1,1,2,1635,4387,4
+1802,convert_element_type_354,call_function,convert_element_type.default,forward,14,1,1,1,1636,4375,4
+1803,alias_default_418,call_function,alias.default,forward,14,1,1,2,1637,4374,4
+1804,neg_14,call_function,neg.default,forward,14,1,1,1,1638,4373,8
+1805,exp_14,call_function,exp.default,forward,14,1,1,1,1639,4372,6
+1806,add_73,call_function,add.Tensor,forward,14,1,1,1,1640,4371,4
+1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6
+1808,convert_element_type_355,call_function,convert_element_type.default,forward,14,1,1,1,1642,4369,6
+1809,dtype_cast_134,call_function,dtype_cast.default,forward,14,1,1,1,1,4373,3
+1810,permute_163,call_function,permute.default,forward,14,1,1,1,2,4372,3
+1811,alias_default_420,call_function,alias.default,forward,14,1,1,2,3,4371,3
+1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5
+1813,alias_default_419,call_function,alias.default,forward,14,1,1,2,1643,4368,4
+1814,alias_default_421,call_function,alias.default,forward,14,1,1,2,1635,4368,4
+1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8
+1816,dtype_cast_135,call_function,dtype_cast.default,forward,14,1,1,1,1,4369,3
+1817,permute_164,call_function,permute.default,forward,14,1,1,1,2,4368,3
+1818,alias_default_422,call_function,alias.default,forward,14,1,1,2,1651,4366,4
+1819,alias_default_423,call_function,alias.default,forward,14,1,1,2,3,4367,3
+1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5
+1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10
+1822,dtype_cast_136,call_function,dtype_cast.default,forward,15,1,1,1,1,4353,2
+1823,alias_default_424,call_function,alias.default,forward,14,1,1,3,1658,4363,4
+1824,convert_element_type_360,call_function,convert_element_type.default,forward,15,1,1,1,1659,4361,4
+1825,alias_default_426,call_function,alias.default,forward,15,1,1,2,1660,4360,4
+1826,pow_31,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1661,4359,4
+1827,mean_30,call_function,mean.dim,forward,15,1,1,1,1662,4358,4
+1828,add_75,call_function,add.Scalar,forward,15,1,1,1,1663,4357,3
+1829,rsqrt_30,call_function,rsqrt.default,forward,15,1,1,1,1664,4356,3
+1830,alias_default_427,call_function,alias.default,forward,15,1,1,3,1665,4355,3
+1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8
+1832,alias_default_425,call_function,alias.default,forward,15,1,1,2,2,4352,2
+1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8
+1834,convert_element_type_361,call_function,convert_element_type.default,forward,15,1,1,1,1671,4349,6
+1835,dtype_cast_137,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3
+1836,permute_165,call_function,permute.default,forward,15,1,1,1,2,4335,3
+1837,alias_default_428,call_function,alias.default,forward,15,1,1,6,1672,4348,4
+1838,alias_default_429,call_function,alias.default,forward,15,1,1,2,3,4334,3
+1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
+1840,dtype_cast_138,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3
+1841,permute_166,call_function,permute.default,forward,15,1,1,1,2,4335,3
+1842,alias_default_430,call_function,alias.default,forward,15,1,1,2,3,4334,3
+1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
+1844,dtype_cast_139,call_function,dtype_cast.default,forward,15,1,1,1,1,4329,3
+1845,permute_167,call_function,permute.default,forward,15,1,1,1,2,4328,3
+1846,alias_default_431,call_function,alias.default,forward,15,1,1,2,3,4327,3
+1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5
+1848,view_351,call_function,view.default,forward,15,1,1,1,1678,4331,4
+1849,view_352,call_function,view.default,forward,15,1,1,1,1678,4331,4
+1850,view_353,call_function,view.default,forward,15,1,1,1,1678,4324,4
+1851,convert_element_type_368,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4
+1852,view_354,call_function,view.default,forward,15,1,1,1,1680,4329,4
+1853,view_as_complex_30,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6
+1854,convert_element_type_369,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4
+1855,view_355,call_function,view.default,forward,15,1,1,1,1680,4329,4
+1856,view_as_complex_31,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6
+1857,view_356,call_function,view.default,forward,15,1,1,1,2,4339,3
+1858,alias_default_432,call_function,alias.default,forward,15,1,1,4,3,4338,3
+1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
+1860,view_as_real_30,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6
+1861,view_357,call_function,view.default,forward,15,1,1,1,1686,4325,6
+1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
+1863,view_as_real_31,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6
+1864,view_358,call_function,view.default,forward,15,1,1,1,1686,4325,6
+1865,convert_element_type_370,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6
+1866,convert_element_type_371,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6
+1867,permute_168,call_function,permute.default,forward,15,1,1,1,1688,4323,6
+1868,permute_169,call_function,permute.default,forward,15,1,1,1,1688,4323,6
+1869,permute_170,call_function,permute.default,forward,15,1,1,1,1679,4323,4
+1870,alias_default_433,call_function,alias.default,forward,15,1,1,2,1689,4322,4
+1871,alias_default_434,call_function,alias.default,forward,15,1,1,2,1689,4322,4
+1872,alias_default_435,call_function,alias.default,forward,15,1,1,2,1680,4322,4
+1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2
+1874,getitem_135,call_function,getitem,forward,15,1,1,1,1714,4317,2
+1875,getitem_136,call_function,getitem,forward,15,1,1,1,1714,1714,2
+1876,getitem_141,call_function,getitem,forward,15,1,1,1,1714,1714,1
+1877,getitem_142,call_function,getitem,forward,15,1,1,1,1714,1714,1
+1878,alias_default_436,call_function,alias.default,forward,15,1,1,2,1715,4316,4
+1879,permute_171,call_function,permute.default,forward,15,1,1,1,1716,4315,4
+1880,view_359,call_function,view.default,forward,15,1,1,1,1717,4314,3
+1881,dtype_cast_140,call_function,dtype_cast.default,forward,15,1,1,1,1,4316,3
+1882,permute_172,call_function,permute.default,forward,15,1,1,1,2,4315,3
+1883,alias_default_437,call_function,alias.default,forward,15,1,1,2,1718,4313,4
+1884,alias_default_438,call_function,alias.default,forward,15,1,1,2,3,4314,3
+1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5
+1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10
+1887,dtype_cast_141,call_function,dtype_cast.default,forward,15,1,1,1,1,4300,2
+1888,alias_default_439,call_function,alias.default,forward,15,1,1,3,1725,4310,4
+1889,convert_element_type_374,call_function,convert_element_type.default,forward,15,1,1,1,1726,4308,4
+1890,alias_default_441,call_function,alias.default,forward,15,1,1,2,1727,4307,4
+1891,pow_32,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1728,4306,4
+1892,mean_31,call_function,mean.dim,forward,15,1,1,1,1729,4305,4
+1893,add_77,call_function,add.Scalar,forward,15,1,1,1,1730,4304,3
+1894,rsqrt_31,call_function,rsqrt.default,forward,15,1,1,1,1731,4303,3
+1895,alias_default_442,call_function,alias.default,forward,15,1,1,3,1732,4302,3
+1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8
+1897,alias_default_440,call_function,alias.default,forward,15,1,1,2,2,4299,2
+1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8
+1899,convert_element_type_375,call_function,convert_element_type.default,forward,15,1,1,1,1738,4296,6
+1900,dtype_cast_142,call_function,dtype_cast.default,forward,15,1,1,1,1,4296,3
+1901,permute_173,call_function,permute.default,forward,15,1,1,1,2,4295,3
+1902,alias_default_443,call_function,alias.default,forward,15,1,1,4,1739,4295,4
+1903,alias_default_444,call_function,alias.default,forward,15,1,1,2,3,4294,3
+1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5
+1905,alias_default_445,call_function,alias.default,forward,15,1,1,2,1745,4291,4
+1906,convert_element_type_378,call_function,convert_element_type.default,forward,15,1,1,1,1746,4279,4
+1907,alias_default_446,call_function,alias.default,forward,15,1,1,2,1747,4278,4
+1908,neg_15,call_function,neg.default,forward,15,1,1,1,1748,4277,8
+1909,exp_15,call_function,exp.default,forward,15,1,1,1,1749,4276,6
+1910,add_78,call_function,add.Tensor,forward,15,1,1,1,1750,4275,4
+1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6
+1912,convert_element_type_379,call_function,convert_element_type.default,forward,15,1,1,1,1752,4273,6
+1913,dtype_cast_143,call_function,dtype_cast.default,forward,15,1,1,1,1,4277,3
+1914,permute_174,call_function,permute.default,forward,15,1,1,1,2,4276,3
+1915,alias_default_448,call_function,alias.default,forward,15,1,1,2,3,4275,3
+1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5
+1917,alias_default_447,call_function,alias.default,forward,15,1,1,2,1753,4272,4
+1918,alias_default_449,call_function,alias.default,forward,15,1,1,2,1745,4272,4
+1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8
+1920,dtype_cast_144,call_function,dtype_cast.default,forward,15,1,1,1,1,4273,3
+1921,permute_175,call_function,permute.default,forward,15,1,1,1,2,4272,3
+1922,alias_default_450,call_function,alias.default,forward,15,1,1,2,1761,4270,4
+1923,alias_default_451,call_function,alias.default,forward,15,1,1,2,3,4271,3
+1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5
+1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10
+1926,dtype_cast_145,call_function,dtype_cast.default,forward,16,1,1,1,1,4257,2
+1927,alias_default_452,call_function,alias.default,forward,15,1,1,3,1768,4267,4
+1928,convert_element_type_384,call_function,convert_element_type.default,forward,16,1,1,1,1769,4265,4
+1929,alias_default_454,call_function,alias.default,forward,16,1,1,2,1770,4264,4
+1930,pow_33,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1771,4263,4
+1931,mean_32,call_function,mean.dim,forward,16,1,1,1,1772,4262,4
+1932,add_80,call_function,add.Scalar,forward,16,1,1,1,1773,4261,3
+1933,rsqrt_32,call_function,rsqrt.default,forward,16,1,1,1,1774,4260,3
+1934,alias_default_455,call_function,alias.default,forward,16,1,1,3,1775,4259,3
+1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8
+1936,alias_default_453,call_function,alias.default,forward,16,1,1,2,2,4256,2
+1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8
+1938,convert_element_type_385,call_function,convert_element_type.default,forward,16,1,1,1,1781,4253,6
+1939,dtype_cast_146,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3
+1940,permute_176,call_function,permute.default,forward,16,1,1,1,2,4239,3
+1941,alias_default_456,call_function,alias.default,forward,16,1,1,6,1782,4252,4
+1942,alias_default_457,call_function,alias.default,forward,16,1,1,2,3,4238,3
+1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
+1944,dtype_cast_147,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3
+1945,permute_177,call_function,permute.default,forward,16,1,1,1,2,4239,3
+1946,alias_default_458,call_function,alias.default,forward,16,1,1,2,3,4238,3
+1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
+1948,dtype_cast_148,call_function,dtype_cast.default,forward,16,1,1,1,1,4233,3
+1949,permute_178,call_function,permute.default,forward,16,1,1,1,2,4232,3
+1950,alias_default_459,call_function,alias.default,forward,16,1,1,2,3,4231,3
+1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5
+1952,view_374,call_function,view.default,forward,16,1,1,1,1788,4235,4
+1953,view_375,call_function,view.default,forward,16,1,1,1,1788,4235,4
+1954,view_376,call_function,view.default,forward,16,1,1,1,1788,4228,4
+1955,convert_element_type_392,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4
+1956,view_377,call_function,view.default,forward,16,1,1,1,1790,4233,4
+1957,view_as_complex_32,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6
+1958,convert_element_type_393,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4
+1959,view_378,call_function,view.default,forward,16,1,1,1,1790,4233,4
+1960,view_as_complex_33,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6
+1961,view_379,call_function,view.default,forward,16,1,1,1,2,4243,3
+1962,alias_default_460,call_function,alias.default,forward,16,1,1,4,3,4242,3
+1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
+1964,view_as_real_32,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6
+1965,view_380,call_function,view.default,forward,16,1,1,1,1796,4229,6
+1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
+1967,view_as_real_33,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6
+1968,view_381,call_function,view.default,forward,16,1,1,1,1796,4229,6
+1969,convert_element_type_394,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6
+1970,convert_element_type_395,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6
+1971,permute_179,call_function,permute.default,forward,16,1,1,1,1798,4227,6
+1972,permute_180,call_function,permute.default,forward,16,1,1,1,1798,4227,6
+1973,permute_181,call_function,permute.default,forward,16,1,1,1,1789,4227,4
+1974,alias_default_461,call_function,alias.default,forward,16,1,1,2,1799,4226,4
+1975,alias_default_462,call_function,alias.default,forward,16,1,1,2,1799,4226,4
+1976,alias_default_463,call_function,alias.default,forward,16,1,1,2,1790,4226,4
+1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2
+1978,getitem_144,call_function,getitem,forward,16,1,1,1,1824,4221,2
+1979,getitem_145,call_function,getitem,forward,16,1,1,1,1824,1824,2
+1980,getitem_150,call_function,getitem,forward,16,1,1,1,1824,1824,1
+1981,getitem_151,call_function,getitem,forward,16,1,1,1,1824,1824,1
+1982,alias_default_464,call_function,alias.default,forward,16,1,1,2,1825,4220,4
+1983,permute_182,call_function,permute.default,forward,16,1,1,1,1826,4219,4
+1984,view_382,call_function,view.default,forward,16,1,1,1,1827,4218,3
+1985,dtype_cast_149,call_function,dtype_cast.default,forward,16,1,1,1,1,4220,3
+1986,permute_183,call_function,permute.default,forward,16,1,1,1,2,4219,3
+1987,alias_default_465,call_function,alias.default,forward,16,1,1,2,1828,4217,4
+1988,alias_default_466,call_function,alias.default,forward,16,1,1,2,3,4218,3
+1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5
+1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10
+1991,dtype_cast_150,call_function,dtype_cast.default,forward,16,1,1,1,1,4204,2
+1992,alias_default_467,call_function,alias.default,forward,16,1,1,3,1835,4214,4
+1993,convert_element_type_398,call_function,convert_element_type.default,forward,16,1,1,1,1836,4212,4
+1994,alias_default_469,call_function,alias.default,forward,16,1,1,2,1837,4211,4
+1995,pow_34,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1838,4210,4
+1996,mean_33,call_function,mean.dim,forward,16,1,1,1,1839,4209,4
+1997,add_82,call_function,add.Scalar,forward,16,1,1,1,1840,4208,3
+1998,rsqrt_33,call_function,rsqrt.default,forward,16,1,1,1,1841,4207,3
+1999,alias_default_470,call_function,alias.default,forward,16,1,1,3,1842,4206,3
+2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8
+2001,alias_default_468,call_function,alias.default,forward,16,1,1,2,2,4203,2
+2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8
+2003,convert_element_type_399,call_function,convert_element_type.default,forward,16,1,1,1,1848,4200,6
+2004,dtype_cast_151,call_function,dtype_cast.default,forward,16,1,1,1,1,4200,3
+2005,permute_184,call_function,permute.default,forward,16,1,1,1,2,4199,3
+2006,alias_default_471,call_function,alias.default,forward,16,1,1,4,1849,4199,4
+2007,alias_default_472,call_function,alias.default,forward,16,1,1,2,3,4198,3
+2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5
+2009,alias_default_473,call_function,alias.default,forward,16,1,1,2,1855,4195,4
+2010,convert_element_type_402,call_function,convert_element_type.default,forward,16,1,1,1,1856,4183,4
+2011,alias_default_474,call_function,alias.default,forward,16,1,1,2,1857,4182,4
+2012,neg_16,call_function,neg.default,forward,16,1,1,1,1858,4181,8
+2013,exp_16,call_function,exp.default,forward,16,1,1,1,1859,4180,6
+2014,add_83,call_function,add.Tensor,forward,16,1,1,1,1860,4179,4
+2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6
+2016,convert_element_type_403,call_function,convert_element_type.default,forward,16,1,1,1,1862,4177,6
+2017,dtype_cast_152,call_function,dtype_cast.default,forward,16,1,1,1,1,4181,3
+2018,permute_185,call_function,permute.default,forward,16,1,1,1,2,4180,3
+2019,alias_default_476,call_function,alias.default,forward,16,1,1,2,3,4179,3
+2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5
+2021,alias_default_475,call_function,alias.default,forward,16,1,1,2,1863,4176,4
+2022,alias_default_477,call_function,alias.default,forward,16,1,1,2,1855,4176,4
+2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8
+2024,dtype_cast_153,call_function,dtype_cast.default,forward,16,1,1,1,1,4177,3
+2025,permute_186,call_function,permute.default,forward,16,1,1,1,2,4176,3
+2026,alias_default_478,call_function,alias.default,forward,16,1,1,2,1871,4174,4
+2027,alias_default_479,call_function,alias.default,forward,16,1,1,2,3,4175,3
+2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5
+2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10
+2030,dtype_cast_154,call_function,dtype_cast.default,forward,17,1,1,1,1,4161,2
+2031,alias_default_480,call_function,alias.default,forward,16,1,1,3,1878,4171,4
+2032,convert_element_type_408,call_function,convert_element_type.default,forward,17,1,1,1,1879,4169,4
+2033,alias_default_482,call_function,alias.default,forward,17,1,1,2,1880,4168,4
+2034,pow_35,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1881,4167,4
+2035,mean_34,call_function,mean.dim,forward,17,1,1,1,1882,4166,4
+2036,add_85,call_function,add.Scalar,forward,17,1,1,1,1883,4165,3
+2037,rsqrt_34,call_function,rsqrt.default,forward,17,1,1,1,1884,4164,3
+2038,alias_default_483,call_function,alias.default,forward,17,1,1,3,1885,4163,3
+2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8
+2040,alias_default_481,call_function,alias.default,forward,17,1,1,2,2,4160,2
+2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8
+2042,convert_element_type_409,call_function,convert_element_type.default,forward,17,1,1,1,1891,4157,6
+2043,dtype_cast_155,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3
+2044,permute_187,call_function,permute.default,forward,17,1,1,1,2,4143,3
+2045,alias_default_484,call_function,alias.default,forward,17,1,1,6,1892,4156,4
+2046,alias_default_485,call_function,alias.default,forward,17,1,1,2,3,4142,3
+2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
+2048,dtype_cast_156,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3
+2049,permute_188,call_function,permute.default,forward,17,1,1,1,2,4143,3
+2050,alias_default_486,call_function,alias.default,forward,17,1,1,2,3,4142,3
+2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
+2052,dtype_cast_157,call_function,dtype_cast.default,forward,17,1,1,1,1,4137,3
+2053,permute_189,call_function,permute.default,forward,17,1,1,1,2,4136,3
+2054,alias_default_487,call_function,alias.default,forward,17,1,1,2,3,4135,3
+2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5
+2056,view_397,call_function,view.default,forward,17,1,1,1,1898,4139,4
+2057,view_398,call_function,view.default,forward,17,1,1,1,1898,4139,4
+2058,view_399,call_function,view.default,forward,17,1,1,1,1898,4132,4
+2059,convert_element_type_416,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4
+2060,view_400,call_function,view.default,forward,17,1,1,1,1900,4137,4
+2061,view_as_complex_34,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6
+2062,convert_element_type_417,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4
+2063,view_401,call_function,view.default,forward,17,1,1,1,1900,4137,4
+2064,view_as_complex_35,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6
+2065,view_402,call_function,view.default,forward,17,1,1,1,2,4147,3
+2066,alias_default_488,call_function,alias.default,forward,17,1,1,4,3,4146,3
+2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
+2068,view_as_real_34,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6
+2069,view_403,call_function,view.default,forward,17,1,1,1,1906,4133,6
+2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
+2071,view_as_real_35,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6
+2072,view_404,call_function,view.default,forward,17,1,1,1,1906,4133,6
+2073,convert_element_type_418,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6
+2074,convert_element_type_419,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6
+2075,permute_190,call_function,permute.default,forward,17,1,1,1,1908,4131,6
+2076,permute_191,call_function,permute.default,forward,17,1,1,1,1908,4131,6
+2077,permute_192,call_function,permute.default,forward,17,1,1,1,1899,4131,4
+2078,alias_default_489,call_function,alias.default,forward,17,1,1,2,1909,4130,4
+2079,alias_default_490,call_function,alias.default,forward,17,1,1,2,1909,4130,4
+2080,alias_default_491,call_function,alias.default,forward,17,1,1,2,1900,4130,4
+2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2
+2082,getitem_153,call_function,getitem,forward,17,1,1,1,1934,4125,2
+2083,getitem_154,call_function,getitem,forward,17,1,1,1,1934,1934,2
+2084,getitem_159,call_function,getitem,forward,17,1,1,1,1934,1934,1
+2085,getitem_160,call_function,getitem,forward,17,1,1,1,1934,1934,1
+2086,alias_default_492,call_function,alias.default,forward,17,1,1,2,1935,4124,4
+2087,permute_193,call_function,permute.default,forward,17,1,1,1,1936,4123,4
+2088,view_405,call_function,view.default,forward,17,1,1,1,1937,4122,3
+2089,dtype_cast_158,call_function,dtype_cast.default,forward,17,1,1,1,1,4124,3
+2090,permute_194,call_function,permute.default,forward,17,1,1,1,2,4123,3
+2091,alias_default_493,call_function,alias.default,forward,17,1,1,2,1938,4121,4
+2092,alias_default_494,call_function,alias.default,forward,17,1,1,2,3,4122,3
+2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5
+2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10
+2095,dtype_cast_159,call_function,dtype_cast.default,forward,17,1,1,1,1,4108,2
+2096,alias_default_495,call_function,alias.default,forward,17,1,1,3,1945,4118,4
+2097,convert_element_type_422,call_function,convert_element_type.default,forward,17,1,1,1,1946,4116,4
+2098,alias_default_497,call_function,alias.default,forward,17,1,1,2,1947,4115,4
+2099,pow_36,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1948,4114,4
+2100,mean_35,call_function,mean.dim,forward,17,1,1,1,1949,4113,4
+2101,add_87,call_function,add.Scalar,forward,17,1,1,1,1950,4112,3
+2102,rsqrt_35,call_function,rsqrt.default,forward,17,1,1,1,1951,4111,3
+2103,alias_default_498,call_function,alias.default,forward,17,1,1,3,1952,4110,3
+2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8
+2105,alias_default_496,call_function,alias.default,forward,17,1,1,2,2,4107,2
+2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8
+2107,convert_element_type_423,call_function,convert_element_type.default,forward,17,1,1,1,1958,4104,6
+2108,dtype_cast_160,call_function,dtype_cast.default,forward,17,1,1,1,1,4104,3
+2109,permute_195,call_function,permute.default,forward,17,1,1,1,2,4103,3
+2110,alias_default_499,call_function,alias.default,forward,17,1,1,4,1959,4103,4
+2111,alias_default_500,call_function,alias.default,forward,17,1,1,2,3,4102,3
+2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5
+2113,alias_default_501,call_function,alias.default,forward,17,1,1,2,1965,4099,4
+2114,convert_element_type_426,call_function,convert_element_type.default,forward,17,1,1,1,1966,4087,4
+2115,alias_default_502,call_function,alias.default,forward,17,1,1,2,1967,4086,4
+2116,neg_17,call_function,neg.default,forward,17,1,1,1,1968,4085,8
+2117,exp_17,call_function,exp.default,forward,17,1,1,1,1969,4084,6
+2118,add_88,call_function,add.Tensor,forward,17,1,1,1,1970,4083,4
+2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6
+2120,convert_element_type_427,call_function,convert_element_type.default,forward,17,1,1,1,1972,4081,6
+2121,dtype_cast_161,call_function,dtype_cast.default,forward,17,1,1,1,1,4085,3
+2122,permute_196,call_function,permute.default,forward,17,1,1,1,2,4084,3
+2123,alias_default_504,call_function,alias.default,forward,17,1,1,2,3,4083,3
+2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5
+2125,alias_default_503,call_function,alias.default,forward,17,1,1,2,1973,4080,4
+2126,alias_default_505,call_function,alias.default,forward,17,1,1,2,1965,4080,4
+2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8
+2128,dtype_cast_162,call_function,dtype_cast.default,forward,17,1,1,1,1,4081,3
+2129,permute_197,call_function,permute.default,forward,17,1,1,1,2,4080,3
+2130,alias_default_506,call_function,alias.default,forward,17,1,1,2,1981,4078,4
+2131,alias_default_507,call_function,alias.default,forward,17,1,1,2,3,4079,3
+2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5
+2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10
+2134,dtype_cast_163,call_function,dtype_cast.default,forward,18,1,1,1,1,4065,2
+2135,alias_default_508,call_function,alias.default,forward,17,1,1,3,1988,4075,4
+2136,convert_element_type_432,call_function,convert_element_type.default,forward,18,1,1,1,1989,4073,4
+2137,alias_default_510,call_function,alias.default,forward,18,1,1,2,1990,4072,4
+2138,pow_37,call_function,pow.Tensor_Scalar,forward,18,1,1,1,1991,4071,4
+2139,mean_36,call_function,mean.dim,forward,18,1,1,1,1992,4070,4
+2140,add_90,call_function,add.Scalar,forward,18,1,1,1,1993,4069,3
+2141,rsqrt_36,call_function,rsqrt.default,forward,18,1,1,1,1994,4068,3
+2142,alias_default_511,call_function,alias.default,forward,18,1,1,3,1995,4067,3
+2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8
+2144,alias_default_509,call_function,alias.default,forward,18,1,1,2,2,4064,2
+2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8
+2146,convert_element_type_433,call_function,convert_element_type.default,forward,18,1,1,1,2001,4061,6
+2147,dtype_cast_164,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3
+2148,permute_198,call_function,permute.default,forward,18,1,1,1,2,4047,3
+2149,alias_default_512,call_function,alias.default,forward,18,1,1,6,2002,4060,4
+2150,alias_default_513,call_function,alias.default,forward,18,1,1,2,3,4046,3
+2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
+2152,dtype_cast_165,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3
+2153,permute_199,call_function,permute.default,forward,18,1,1,1,2,4047,3
+2154,alias_default_514,call_function,alias.default,forward,18,1,1,2,3,4046,3
+2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
+2156,dtype_cast_166,call_function,dtype_cast.default,forward,18,1,1,1,1,4041,3
+2157,permute_200,call_function,permute.default,forward,18,1,1,1,2,4040,3
+2158,alias_default_515,call_function,alias.default,forward,18,1,1,2,3,4039,3
+2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5
+2160,view_420,call_function,view.default,forward,18,1,1,1,2008,4043,4
+2161,view_421,call_function,view.default,forward,18,1,1,1,2008,4043,4
+2162,view_422,call_function,view.default,forward,18,1,1,1,2008,4036,4
+2163,convert_element_type_440,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4
+2164,view_423,call_function,view.default,forward,18,1,1,1,2010,4041,4
+2165,view_as_complex_36,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6
+2166,convert_element_type_441,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4
+2167,view_424,call_function,view.default,forward,18,1,1,1,2010,4041,4
+2168,view_as_complex_37,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6
+2169,view_425,call_function,view.default,forward,18,1,1,1,2,4051,3
+2170,alias_default_516,call_function,alias.default,forward,18,1,1,4,3,4050,3
+2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
+2172,view_as_real_36,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6
+2173,view_426,call_function,view.default,forward,18,1,1,1,2016,4037,6
+2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
+2175,view_as_real_37,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6
+2176,view_427,call_function,view.default,forward,18,1,1,1,2016,4037,6
+2177,convert_element_type_442,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6
+2178,convert_element_type_443,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6
+2179,permute_201,call_function,permute.default,forward,18,1,1,1,2018,4035,6
+2180,permute_202,call_function,permute.default,forward,18,1,1,1,2018,4035,6
+2181,permute_203,call_function,permute.default,forward,18,1,1,1,2009,4035,4
+2182,alias_default_517,call_function,alias.default,forward,18,1,1,2,2019,4034,4
+2183,alias_default_518,call_function,alias.default,forward,18,1,1,2,2019,4034,4
+2184,alias_default_519,call_function,alias.default,forward,18,1,1,2,2010,4034,4
+2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2
+2186,getitem_162,call_function,getitem,forward,18,1,1,1,2044,4029,2
+2187,getitem_163,call_function,getitem,forward,18,1,1,1,2044,2044,2
+2188,getitem_168,call_function,getitem,forward,18,1,1,1,2044,2044,1
+2189,getitem_169,call_function,getitem,forward,18,1,1,1,2044,2044,1
+2190,alias_default_520,call_function,alias.default,forward,18,1,1,2,2045,4028,4
+2191,permute_204,call_function,permute.default,forward,18,1,1,1,2046,4027,4
+2192,view_428,call_function,view.default,forward,18,1,1,1,2047,4026,3
+2193,dtype_cast_167,call_function,dtype_cast.default,forward,18,1,1,1,1,4028,3
+2194,permute_205,call_function,permute.default,forward,18,1,1,1,2,4027,3
+2195,alias_default_521,call_function,alias.default,forward,18,1,1,2,2048,4025,4
+2196,alias_default_522,call_function,alias.default,forward,18,1,1,2,3,4026,3
+2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5
+2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10
+2199,dtype_cast_168,call_function,dtype_cast.default,forward,18,1,1,1,1,4012,2
+2200,alias_default_523,call_function,alias.default,forward,18,1,1,3,2055,4022,4
+2201,convert_element_type_446,call_function,convert_element_type.default,forward,18,1,1,1,2056,4020,4
+2202,alias_default_525,call_function,alias.default,forward,18,1,1,2,2057,4019,4
+2203,pow_38,call_function,pow.Tensor_Scalar,forward,18,1,1,1,2058,4018,4
+2204,mean_37,call_function,mean.dim,forward,18,1,1,1,2059,4017,4
+2205,add_92,call_function,add.Scalar,forward,18,1,1,1,2060,4016,3
+2206,rsqrt_37,call_function,rsqrt.default,forward,18,1,1,1,2061,4015,3
+2207,alias_default_526,call_function,alias.default,forward,18,1,1,3,2062,4014,3
+2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8
+2209,alias_default_524,call_function,alias.default,forward,18,1,1,2,2,4011,2
+2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8
+2211,convert_element_type_447,call_function,convert_element_type.default,forward,18,1,1,1,2068,4008,6
+2212,dtype_cast_169,call_function,dtype_cast.default,forward,18,1,1,1,1,4008,3
+2213,permute_206,call_function,permute.default,forward,18,1,1,1,2,4007,3
+2214,alias_default_527,call_function,alias.default,forward,18,1,1,4,2069,4007,4
+2215,alias_default_528,call_function,alias.default,forward,18,1,1,2,3,4006,3
+2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5
+2217,alias_default_529,call_function,alias.default,forward,18,1,1,2,2075,4003,4
+2218,convert_element_type_450,call_function,convert_element_type.default,forward,18,1,1,1,2076,3991,4
+2219,alias_default_530,call_function,alias.default,forward,18,1,1,2,2077,3990,4
+2220,neg_18,call_function,neg.default,forward,18,1,1,1,2078,3989,8
+2221,exp_18,call_function,exp.default,forward,18,1,1,1,2079,3988,6
+2222,add_93,call_function,add.Tensor,forward,18,1,1,1,2080,3987,4
+2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6
+2224,convert_element_type_451,call_function,convert_element_type.default,forward,18,1,1,1,2082,3985,6
+2225,dtype_cast_170,call_function,dtype_cast.default,forward,18,1,1,1,1,3989,3
+2226,permute_207,call_function,permute.default,forward,18,1,1,1,2,3988,3
+2227,alias_default_532,call_function,alias.default,forward,18,1,1,2,3,3987,3
+2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5
+2229,alias_default_531,call_function,alias.default,forward,18,1,1,2,2083,3984,4
+2230,alias_default_533,call_function,alias.default,forward,18,1,1,2,2075,3984,4
+2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8
+2232,dtype_cast_171,call_function,dtype_cast.default,forward,18,1,1,1,1,3985,3
+2233,permute_208,call_function,permute.default,forward,18,1,1,1,2,3984,3
+2234,alias_default_534,call_function,alias.default,forward,18,1,1,2,2091,3982,4
+2235,alias_default_535,call_function,alias.default,forward,18,1,1,2,3,3983,3
+2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5
+2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10
+2238,dtype_cast_172,call_function,dtype_cast.default,forward,19,1,1,1,1,3969,2
+2239,alias_default_536,call_function,alias.default,forward,18,1,1,3,2098,3979,4
+2240,convert_element_type_456,call_function,convert_element_type.default,forward,19,1,1,1,2099,3977,4
+2241,alias_default_538,call_function,alias.default,forward,19,1,1,2,2100,3976,4
+2242,pow_39,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2101,3975,4
+2243,mean_38,call_function,mean.dim,forward,19,1,1,1,2102,3974,4
+2244,add_95,call_function,add.Scalar,forward,19,1,1,1,2103,3973,3
+2245,rsqrt_38,call_function,rsqrt.default,forward,19,1,1,1,2104,3972,3
+2246,alias_default_539,call_function,alias.default,forward,19,1,1,3,2105,3971,3
+2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8
+2248,alias_default_537,call_function,alias.default,forward,19,1,1,2,2,3968,2
+2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8
+2250,convert_element_type_457,call_function,convert_element_type.default,forward,19,1,1,1,2111,3965,6
+2251,dtype_cast_173,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3
+2252,permute_209,call_function,permute.default,forward,19,1,1,1,2,3951,3
+2253,alias_default_540,call_function,alias.default,forward,19,1,1,6,2112,3964,4
+2254,alias_default_541,call_function,alias.default,forward,19,1,1,2,3,3950,3
+2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
+2256,dtype_cast_174,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3
+2257,permute_210,call_function,permute.default,forward,19,1,1,1,2,3951,3
+2258,alias_default_542,call_function,alias.default,forward,19,1,1,2,3,3950,3
+2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
+2260,dtype_cast_175,call_function,dtype_cast.default,forward,19,1,1,1,1,3945,3
+2261,permute_211,call_function,permute.default,forward,19,1,1,1,2,3944,3
+2262,alias_default_543,call_function,alias.default,forward,19,1,1,2,3,3943,3
+2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5
+2264,view_443,call_function,view.default,forward,19,1,1,1,2118,3947,4
+2265,view_444,call_function,view.default,forward,19,1,1,1,2118,3947,4
+2266,view_445,call_function,view.default,forward,19,1,1,1,2118,3940,4
+2267,convert_element_type_464,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4
+2268,view_446,call_function,view.default,forward,19,1,1,1,2120,3945,4
+2269,view_as_complex_38,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6
+2270,convert_element_type_465,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4
+2271,view_447,call_function,view.default,forward,19,1,1,1,2120,3945,4
+2272,view_as_complex_39,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6
+2273,view_448,call_function,view.default,forward,19,1,1,1,2,3955,3
+2274,alias_default_544,call_function,alias.default,forward,19,1,1,4,3,3954,3
+2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
+2276,view_as_real_38,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6
+2277,view_449,call_function,view.default,forward,19,1,1,1,2126,3941,6
+2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
+2279,view_as_real_39,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6
+2280,view_450,call_function,view.default,forward,19,1,1,1,2126,3941,6
+2281,convert_element_type_466,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6
+2282,convert_element_type_467,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6
+2283,permute_212,call_function,permute.default,forward,19,1,1,1,2128,3939,6
+2284,permute_213,call_function,permute.default,forward,19,1,1,1,2128,3939,6
+2285,permute_214,call_function,permute.default,forward,19,1,1,1,2119,3939,4
+2286,alias_default_545,call_function,alias.default,forward,19,1,1,2,2129,3938,4
+2287,alias_default_546,call_function,alias.default,forward,19,1,1,2,2129,3938,4
+2288,alias_default_547,call_function,alias.default,forward,19,1,1,2,2120,3938,4
+2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2
+2290,getitem_171,call_function,getitem,forward,19,1,1,1,2154,3933,2
+2291,getitem_172,call_function,getitem,forward,19,1,1,1,2154,2154,2
+2292,getitem_177,call_function,getitem,forward,19,1,1,1,2154,2154,1
+2293,getitem_178,call_function,getitem,forward,19,1,1,1,2154,2154,1
+2294,alias_default_548,call_function,alias.default,forward,19,1,1,2,2155,3932,4
+2295,permute_215,call_function,permute.default,forward,19,1,1,1,2156,3931,4
+2296,view_451,call_function,view.default,forward,19,1,1,1,2157,3930,3
+2297,dtype_cast_176,call_function,dtype_cast.default,forward,19,1,1,1,1,3932,3
+2298,permute_216,call_function,permute.default,forward,19,1,1,1,2,3931,3
+2299,alias_default_549,call_function,alias.default,forward,19,1,1,2,2158,3929,4
+2300,alias_default_550,call_function,alias.default,forward,19,1,1,2,3,3930,3
+2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5
+2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10
+2303,dtype_cast_177,call_function,dtype_cast.default,forward,19,1,1,1,1,3916,2
+2304,alias_default_551,call_function,alias.default,forward,19,1,1,3,2165,3926,4
+2305,convert_element_type_470,call_function,convert_element_type.default,forward,19,1,1,1,2166,3924,4
+2306,alias_default_553,call_function,alias.default,forward,19,1,1,2,2167,3923,4
+2307,pow_40,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2168,3922,4
+2308,mean_39,call_function,mean.dim,forward,19,1,1,1,2169,3921,4
+2309,add_97,call_function,add.Scalar,forward,19,1,1,1,2170,3920,3
+2310,rsqrt_39,call_function,rsqrt.default,forward,19,1,1,1,2171,3919,3
+2311,alias_default_554,call_function,alias.default,forward,19,1,1,3,2172,3918,3
+2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8
+2313,alias_default_552,call_function,alias.default,forward,19,1,1,2,2,3915,2
+2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8
+2315,convert_element_type_471,call_function,convert_element_type.default,forward,19,1,1,1,2178,3912,6
+2316,dtype_cast_178,call_function,dtype_cast.default,forward,19,1,1,1,1,3912,3
+2317,permute_217,call_function,permute.default,forward,19,1,1,1,2,3911,3
+2318,alias_default_555,call_function,alias.default,forward,19,1,1,4,2179,3911,4
+2319,alias_default_556,call_function,alias.default,forward,19,1,1,2,3,3910,3
+2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5
+2321,alias_default_557,call_function,alias.default,forward,19,1,1,2,2185,3907,4
+2322,convert_element_type_474,call_function,convert_element_type.default,forward,19,1,1,1,2186,3895,4
+2323,alias_default_558,call_function,alias.default,forward,19,1,1,2,2187,3894,4
+2324,neg_19,call_function,neg.default,forward,19,1,1,1,2188,3893,8
+2325,exp_19,call_function,exp.default,forward,19,1,1,1,2189,3892,6
+2326,add_98,call_function,add.Tensor,forward,19,1,1,1,2190,3891,4
+2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6
+2328,convert_element_type_475,call_function,convert_element_type.default,forward,19,1,1,1,2192,3889,6
+2329,dtype_cast_179,call_function,dtype_cast.default,forward,19,1,1,1,1,3893,3
+2330,permute_218,call_function,permute.default,forward,19,1,1,1,2,3892,3
+2331,alias_default_560,call_function,alias.default,forward,19,1,1,2,3,3891,3
+2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5
+2333,alias_default_559,call_function,alias.default,forward,19,1,1,2,2193,3888,4
+2334,alias_default_561,call_function,alias.default,forward,19,1,1,2,2185,3888,4
+2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8
+2336,dtype_cast_180,call_function,dtype_cast.default,forward,19,1,1,1,1,3889,3
+2337,permute_219,call_function,permute.default,forward,19,1,1,1,2,3888,3
+2338,alias_default_562,call_function,alias.default,forward,19,1,1,2,2201,3886,4
+2339,alias_default_563,call_function,alias.default,forward,19,1,1,2,3,3887,3
+2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5
+2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10
+2342,dtype_cast_181,call_function,dtype_cast.default,forward,20,1,1,1,1,3873,2
+2343,alias_default_564,call_function,alias.default,forward,19,1,1,3,2208,3883,4
+2344,convert_element_type_480,call_function,convert_element_type.default,forward,20,1,1,1,2209,3881,4
+2345,alias_default_566,call_function,alias.default,forward,20,1,1,2,2210,3880,4
+2346,pow_41,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2211,3879,4
+2347,mean_40,call_function,mean.dim,forward,20,1,1,1,2212,3878,4
+2348,add_100,call_function,add.Scalar,forward,20,1,1,1,2213,3877,3
+2349,rsqrt_40,call_function,rsqrt.default,forward,20,1,1,1,2214,3876,3
+2350,alias_default_567,call_function,alias.default,forward,20,1,1,3,2215,3875,3
+2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8
+2352,alias_default_565,call_function,alias.default,forward,20,1,1,2,2,3872,2
+2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8
+2354,convert_element_type_481,call_function,convert_element_type.default,forward,20,1,1,1,2221,3869,6
+2355,dtype_cast_182,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3
+2356,permute_220,call_function,permute.default,forward,20,1,1,1,2,3855,3
+2357,alias_default_568,call_function,alias.default,forward,20,1,1,6,2222,3868,4
+2358,alias_default_569,call_function,alias.default,forward,20,1,1,2,3,3854,3
+2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
+2360,dtype_cast_183,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3
+2361,permute_221,call_function,permute.default,forward,20,1,1,1,2,3855,3
+2362,alias_default_570,call_function,alias.default,forward,20,1,1,2,3,3854,3
+2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
+2364,dtype_cast_184,call_function,dtype_cast.default,forward,20,1,1,1,1,3849,3
+2365,permute_222,call_function,permute.default,forward,20,1,1,1,2,3848,3
+2366,alias_default_571,call_function,alias.default,forward,20,1,1,2,3,3847,3
+2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5
+2368,view_466,call_function,view.default,forward,20,1,1,1,2228,3851,4
+2369,view_467,call_function,view.default,forward,20,1,1,1,2228,3851,4
+2370,view_468,call_function,view.default,forward,20,1,1,1,2228,3844,4
+2371,convert_element_type_488,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4
+2372,view_469,call_function,view.default,forward,20,1,1,1,2230,3849,4
+2373,view_as_complex_40,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6
+2374,convert_element_type_489,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4
+2375,view_470,call_function,view.default,forward,20,1,1,1,2230,3849,4
+2376,view_as_complex_41,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6
+2377,view_471,call_function,view.default,forward,20,1,1,1,2,3859,3
+2378,alias_default_572,call_function,alias.default,forward,20,1,1,4,3,3858,3
+2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
+2380,view_as_real_40,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6
+2381,view_472,call_function,view.default,forward,20,1,1,1,2236,3845,6
+2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
+2383,view_as_real_41,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6
+2384,view_473,call_function,view.default,forward,20,1,1,1,2236,3845,6
+2385,convert_element_type_490,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6
+2386,convert_element_type_491,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6
+2387,permute_223,call_function,permute.default,forward,20,1,1,1,2238,3843,6
+2388,permute_224,call_function,permute.default,forward,20,1,1,1,2238,3843,6
+2389,permute_225,call_function,permute.default,forward,20,1,1,1,2229,3843,4
+2390,alias_default_573,call_function,alias.default,forward,20,1,1,2,2239,3842,4
+2391,alias_default_574,call_function,alias.default,forward,20,1,1,2,2239,3842,4
+2392,alias_default_575,call_function,alias.default,forward,20,1,1,2,2230,3842,4
+2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2
+2394,getitem_180,call_function,getitem,forward,20,1,1,1,2264,3837,2
+2395,getitem_181,call_function,getitem,forward,20,1,1,1,2264,2264,2
+2396,getitem_186,call_function,getitem,forward,20,1,1,1,2264,2264,1
+2397,getitem_187,call_function,getitem,forward,20,1,1,1,2264,2264,1
+2398,alias_default_576,call_function,alias.default,forward,20,1,1,2,2265,3836,4
+2399,permute_226,call_function,permute.default,forward,20,1,1,1,2266,3835,4
+2400,view_474,call_function,view.default,forward,20,1,1,1,2267,3834,3
+2401,dtype_cast_185,call_function,dtype_cast.default,forward,20,1,1,1,1,3836,3
+2402,permute_227,call_function,permute.default,forward,20,1,1,1,2,3835,3
+2403,alias_default_577,call_function,alias.default,forward,20,1,1,2,2268,3833,4
+2404,alias_default_578,call_function,alias.default,forward,20,1,1,2,3,3834,3
+2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5
+2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10
+2407,dtype_cast_186,call_function,dtype_cast.default,forward,20,1,1,1,1,3820,2
+2408,alias_default_579,call_function,alias.default,forward,20,1,1,3,2275,3830,4
+2409,convert_element_type_494,call_function,convert_element_type.default,forward,20,1,1,1,2276,3828,4
+2410,alias_default_581,call_function,alias.default,forward,20,1,1,2,2277,3827,4
+2411,pow_42,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2278,3826,4
+2412,mean_41,call_function,mean.dim,forward,20,1,1,1,2279,3825,4
+2413,add_102,call_function,add.Scalar,forward,20,1,1,1,2280,3824,3
+2414,rsqrt_41,call_function,rsqrt.default,forward,20,1,1,1,2281,3823,3
+2415,alias_default_582,call_function,alias.default,forward,20,1,1,3,2282,3822,3
+2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8
+2417,alias_default_580,call_function,alias.default,forward,20,1,1,2,2,3819,2
+2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8
+2419,convert_element_type_495,call_function,convert_element_type.default,forward,20,1,1,1,2288,3816,6
+2420,dtype_cast_187,call_function,dtype_cast.default,forward,20,1,1,1,1,3816,3
+2421,permute_228,call_function,permute.default,forward,20,1,1,1,2,3815,3
+2422,alias_default_583,call_function,alias.default,forward,20,1,1,4,2289,3815,4
+2423,alias_default_584,call_function,alias.default,forward,20,1,1,2,3,3814,3
+2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5
+2425,alias_default_585,call_function,alias.default,forward,20,1,1,2,2295,3811,4
+2426,convert_element_type_498,call_function,convert_element_type.default,forward,20,1,1,1,2296,3799,4
+2427,alias_default_586,call_function,alias.default,forward,20,1,1,2,2297,3798,4
+2428,neg_20,call_function,neg.default,forward,20,1,1,1,2298,3797,8
+2429,exp_20,call_function,exp.default,forward,20,1,1,1,2299,3796,6
+2430,add_103,call_function,add.Tensor,forward,20,1,1,1,2300,3795,4
+2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6
+2432,convert_element_type_499,call_function,convert_element_type.default,forward,20,1,1,1,2302,3793,6
+2433,dtype_cast_188,call_function,dtype_cast.default,forward,20,1,1,1,1,3797,3
+2434,permute_229,call_function,permute.default,forward,20,1,1,1,2,3796,3
+2435,alias_default_588,call_function,alias.default,forward,20,1,1,2,3,3795,3
+2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5
+2437,alias_default_587,call_function,alias.default,forward,20,1,1,2,2303,3792,4
+2438,alias_default_589,call_function,alias.default,forward,20,1,1,2,2295,3792,4
+2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8
+2440,dtype_cast_189,call_function,dtype_cast.default,forward,20,1,1,1,1,3793,3
+2441,permute_230,call_function,permute.default,forward,20,1,1,1,2,3792,3
+2442,alias_default_590,call_function,alias.default,forward,20,1,1,2,2311,3790,4
+2443,alias_default_591,call_function,alias.default,forward,20,1,1,2,3,3791,3
+2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5
+2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10
+2446,dtype_cast_190,call_function,dtype_cast.default,forward,21,1,1,1,1,3777,2
+2447,alias_default_592,call_function,alias.default,forward,20,1,1,3,2318,3787,4
+2448,convert_element_type_504,call_function,convert_element_type.default,forward,21,1,1,1,2319,3785,4
+2449,alias_default_594,call_function,alias.default,forward,21,1,1,2,2320,3784,4
+2450,pow_43,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2321,3783,4
+2451,mean_42,call_function,mean.dim,forward,21,1,1,1,2322,3782,4
+2452,add_105,call_function,add.Scalar,forward,21,1,1,1,2323,3781,3
+2453,rsqrt_42,call_function,rsqrt.default,forward,21,1,1,1,2324,3780,3
+2454,alias_default_595,call_function,alias.default,forward,21,1,1,3,2325,3779,3
+2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8
+2456,alias_default_593,call_function,alias.default,forward,21,1,1,2,2,3776,2
+2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8
+2458,convert_element_type_505,call_function,convert_element_type.default,forward,21,1,1,1,2331,3773,6
+2459,dtype_cast_191,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3
+2460,permute_231,call_function,permute.default,forward,21,1,1,1,2,3759,3
+2461,alias_default_596,call_function,alias.default,forward,21,1,1,6,2332,3772,4
+2462,alias_default_597,call_function,alias.default,forward,21,1,1,2,3,3758,3
+2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
+2464,dtype_cast_192,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3
+2465,permute_232,call_function,permute.default,forward,21,1,1,1,2,3759,3
+2466,alias_default_598,call_function,alias.default,forward,21,1,1,2,3,3758,3
+2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
+2468,dtype_cast_193,call_function,dtype_cast.default,forward,21,1,1,1,1,3753,3
+2469,permute_233,call_function,permute.default,forward,21,1,1,1,2,3752,3
+2470,alias_default_599,call_function,alias.default,forward,21,1,1,2,3,3751,3
+2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5
+2472,view_489,call_function,view.default,forward,21,1,1,1,2338,3755,4
+2473,view_490,call_function,view.default,forward,21,1,1,1,2338,3755,4
+2474,view_491,call_function,view.default,forward,21,1,1,1,2338,3748,4
+2475,convert_element_type_512,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4
+2476,view_492,call_function,view.default,forward,21,1,1,1,2340,3753,4
+2477,view_as_complex_42,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6
+2478,convert_element_type_513,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4
+2479,view_493,call_function,view.default,forward,21,1,1,1,2340,3753,4
+2480,view_as_complex_43,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6
+2481,view_494,call_function,view.default,forward,21,1,1,1,2,3763,3
+2482,alias_default_600,call_function,alias.default,forward,21,1,1,4,3,3762,3
+2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
+2484,view_as_real_42,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6
+2485,view_495,call_function,view.default,forward,21,1,1,1,2346,3749,6
+2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
+2487,view_as_real_43,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6
+2488,view_496,call_function,view.default,forward,21,1,1,1,2346,3749,6
+2489,convert_element_type_514,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6
+2490,convert_element_type_515,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6
+2491,permute_234,call_function,permute.default,forward,21,1,1,1,2348,3747,6
+2492,permute_235,call_function,permute.default,forward,21,1,1,1,2348,3747,6
+2493,permute_236,call_function,permute.default,forward,21,1,1,1,2339,3747,4
+2494,alias_default_601,call_function,alias.default,forward,21,1,1,2,2349,3746,4
+2495,alias_default_602,call_function,alias.default,forward,21,1,1,2,2349,3746,4
+2496,alias_default_603,call_function,alias.default,forward,21,1,1,2,2340,3746,4
+2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2
+2498,getitem_189,call_function,getitem,forward,21,1,1,1,2374,3741,2
+2499,getitem_190,call_function,getitem,forward,21,1,1,1,2374,2374,2
+2500,getitem_195,call_function,getitem,forward,21,1,1,1,2374,2374,1
+2501,getitem_196,call_function,getitem,forward,21,1,1,1,2374,2374,1
+2502,alias_default_604,call_function,alias.default,forward,21,1,1,2,2375,3740,4
+2503,permute_237,call_function,permute.default,forward,21,1,1,1,2376,3739,4
+2504,view_497,call_function,view.default,forward,21,1,1,1,2377,3738,3
+2505,dtype_cast_194,call_function,dtype_cast.default,forward,21,1,1,1,1,3740,3
+2506,permute_238,call_function,permute.default,forward,21,1,1,1,2,3739,3
+2507,alias_default_605,call_function,alias.default,forward,21,1,1,2,2378,3737,4
+2508,alias_default_606,call_function,alias.default,forward,21,1,1,2,3,3738,3
+2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5
+2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10
+2511,dtype_cast_195,call_function,dtype_cast.default,forward,21,1,1,1,1,3724,2
+2512,alias_default_607,call_function,alias.default,forward,21,1,1,3,2385,3734,4
+2513,convert_element_type_518,call_function,convert_element_type.default,forward,21,1,1,1,2386,3732,4
+2514,alias_default_609,call_function,alias.default,forward,21,1,1,2,2387,3731,4
+2515,pow_44,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2388,3730,4
+2516,mean_43,call_function,mean.dim,forward,21,1,1,1,2389,3729,4
+2517,add_107,call_function,add.Scalar,forward,21,1,1,1,2390,3728,3
+2518,rsqrt_43,call_function,rsqrt.default,forward,21,1,1,1,2391,3727,3
+2519,alias_default_610,call_function,alias.default,forward,21,1,1,3,2392,3726,3
+2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8
+2521,alias_default_608,call_function,alias.default,forward,21,1,1,2,2,3723,2
+2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8
+2523,convert_element_type_519,call_function,convert_element_type.default,forward,21,1,1,1,2398,3720,6
+2524,dtype_cast_196,call_function,dtype_cast.default,forward,21,1,1,1,1,3720,3
+2525,permute_239,call_function,permute.default,forward,21,1,1,1,2,3719,3
+2526,alias_default_611,call_function,alias.default,forward,21,1,1,4,2399,3719,4
+2527,alias_default_612,call_function,alias.default,forward,21,1,1,2,3,3718,3
+2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5
+2529,alias_default_613,call_function,alias.default,forward,21,1,1,2,2405,3715,4
+2530,convert_element_type_522,call_function,convert_element_type.default,forward,21,1,1,1,2406,3703,4
+2531,alias_default_614,call_function,alias.default,forward,21,1,1,2,2407,3702,4
+2532,neg_21,call_function,neg.default,forward,21,1,1,1,2408,3701,8
+2533,exp_21,call_function,exp.default,forward,21,1,1,1,2409,3700,6
+2534,add_108,call_function,add.Tensor,forward,21,1,1,1,2410,3699,4
+2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6
+2536,convert_element_type_523,call_function,convert_element_type.default,forward,21,1,1,1,2412,3697,6
+2537,dtype_cast_197,call_function,dtype_cast.default,forward,21,1,1,1,1,3701,3
+2538,permute_240,call_function,permute.default,forward,21,1,1,1,2,3700,3
+2539,alias_default_616,call_function,alias.default,forward,21,1,1,2,3,3699,3
+2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5
+2541,alias_default_615,call_function,alias.default,forward,21,1,1,2,2413,3696,4
+2542,alias_default_617,call_function,alias.default,forward,21,1,1,2,2405,3696,4
+2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8
+2544,dtype_cast_198,call_function,dtype_cast.default,forward,21,1,1,1,1,3697,3
+2545,permute_241,call_function,permute.default,forward,21,1,1,1,2,3696,3
+2546,alias_default_618,call_function,alias.default,forward,21,1,1,2,2421,3694,4
+2547,alias_default_619,call_function,alias.default,forward,21,1,1,2,3,3695,3
+2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5
+2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10
+2550,dtype_cast_199,call_function,dtype_cast.default,forward,22,1,1,1,1,3681,2
+2551,alias_default_620,call_function,alias.default,forward,21,1,1,3,2428,3691,4
+2552,convert_element_type_528,call_function,convert_element_type.default,forward,22,1,1,1,2429,3689,4
+2553,alias_default_622,call_function,alias.default,forward,22,1,1,2,2430,3688,4
+2554,pow_45,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2431,3687,4
+2555,mean_44,call_function,mean.dim,forward,22,1,1,1,2432,3686,4
+2556,add_110,call_function,add.Scalar,forward,22,1,1,1,2433,3685,3
+2557,rsqrt_44,call_function,rsqrt.default,forward,22,1,1,1,2434,3684,3
+2558,alias_default_623,call_function,alias.default,forward,22,1,1,3,2435,3683,3
+2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8
+2560,alias_default_621,call_function,alias.default,forward,22,1,1,2,2,3680,2
+2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8
+2562,convert_element_type_529,call_function,convert_element_type.default,forward,22,1,1,1,2441,3677,6
+2563,dtype_cast_200,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3
+2564,permute_242,call_function,permute.default,forward,22,1,1,1,2,3663,3
+2565,alias_default_624,call_function,alias.default,forward,22,1,1,6,2442,3676,4
+2566,alias_default_625,call_function,alias.default,forward,22,1,1,2,3,3662,3
+2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
+2568,dtype_cast_201,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3
+2569,permute_243,call_function,permute.default,forward,22,1,1,1,2,3663,3
+2570,alias_default_626,call_function,alias.default,forward,22,1,1,2,3,3662,3
+2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
+2572,dtype_cast_202,call_function,dtype_cast.default,forward,22,1,1,1,1,3657,3
+2573,permute_244,call_function,permute.default,forward,22,1,1,1,2,3656,3
+2574,alias_default_627,call_function,alias.default,forward,22,1,1,2,3,3655,3
+2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5
+2576,view_512,call_function,view.default,forward,22,1,1,1,2448,3659,4
+2577,view_513,call_function,view.default,forward,22,1,1,1,2448,3659,4
+2578,view_514,call_function,view.default,forward,22,1,1,1,2448,3652,4
+2579,convert_element_type_536,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4
+2580,view_515,call_function,view.default,forward,22,1,1,1,2450,3657,4
+2581,view_as_complex_44,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6
+2582,convert_element_type_537,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4
+2583,view_516,call_function,view.default,forward,22,1,1,1,2450,3657,4
+2584,view_as_complex_45,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6
+2585,view_517,call_function,view.default,forward,22,1,1,1,2,3667,3
+2586,alias_default_628,call_function,alias.default,forward,22,1,1,4,3,3666,3
+2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
+2588,view_as_real_44,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6
+2589,view_518,call_function,view.default,forward,22,1,1,1,2456,3653,6
+2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
+2591,view_as_real_45,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6
+2592,view_519,call_function,view.default,forward,22,1,1,1,2456,3653,6
+2593,convert_element_type_538,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6
+2594,convert_element_type_539,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6
+2595,permute_245,call_function,permute.default,forward,22,1,1,1,2458,3651,6
+2596,permute_246,call_function,permute.default,forward,22,1,1,1,2458,3651,6
+2597,permute_247,call_function,permute.default,forward,22,1,1,1,2449,3651,4
+2598,alias_default_629,call_function,alias.default,forward,22,1,1,2,2459,3650,4
+2599,alias_default_630,call_function,alias.default,forward,22,1,1,2,2459,3650,4
+2600,alias_default_631,call_function,alias.default,forward,22,1,1,2,2450,3650,4
+2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2
+2602,getitem_198,call_function,getitem,forward,22,1,1,1,2484,3645,2
+2603,getitem_199,call_function,getitem,forward,22,1,1,1,2484,2484,2
+2604,getitem_204,call_function,getitem,forward,22,1,1,1,2484,2484,1
+2605,getitem_205,call_function,getitem,forward,22,1,1,1,2484,2484,1
+2606,alias_default_632,call_function,alias.default,forward,22,1,1,2,2485,3644,4
+2607,permute_248,call_function,permute.default,forward,22,1,1,1,2486,3643,4
+2608,view_520,call_function,view.default,forward,22,1,1,1,2487,3642,3
+2609,dtype_cast_203,call_function,dtype_cast.default,forward,22,1,1,1,1,3644,3
+2610,permute_249,call_function,permute.default,forward,22,1,1,1,2,3643,3
+2611,alias_default_633,call_function,alias.default,forward,22,1,1,2,2488,3641,4
+2612,alias_default_634,call_function,alias.default,forward,22,1,1,2,3,3642,3
+2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5
+2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10
+2615,dtype_cast_204,call_function,dtype_cast.default,forward,22,1,1,1,1,3628,2
+2616,alias_default_635,call_function,alias.default,forward,22,1,1,3,2495,3638,4
+2617,convert_element_type_542,call_function,convert_element_type.default,forward,22,1,1,1,2496,3636,4
+2618,alias_default_637,call_function,alias.default,forward,22,1,1,2,2497,3635,4
+2619,pow_46,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2498,3634,4
+2620,mean_45,call_function,mean.dim,forward,22,1,1,1,2499,3633,4
+2621,add_112,call_function,add.Scalar,forward,22,1,1,1,2500,3632,3
+2622,rsqrt_45,call_function,rsqrt.default,forward,22,1,1,1,2501,3631,3
+2623,alias_default_638,call_function,alias.default,forward,22,1,1,3,2502,3630,3
+2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8
+2625,alias_default_636,call_function,alias.default,forward,22,1,1,2,2,3627,2
+2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8
+2627,convert_element_type_543,call_function,convert_element_type.default,forward,22,1,1,1,2508,3624,6
+2628,dtype_cast_205,call_function,dtype_cast.default,forward,22,1,1,1,1,3624,3
+2629,permute_250,call_function,permute.default,forward,22,1,1,1,2,3623,3
+2630,alias_default_639,call_function,alias.default,forward,22,1,1,4,2509,3623,4
+2631,alias_default_640,call_function,alias.default,forward,22,1,1,2,3,3622,3
+2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5
+2633,alias_default_641,call_function,alias.default,forward,22,1,1,2,2515,3619,4
+2634,convert_element_type_546,call_function,convert_element_type.default,forward,22,1,1,1,2516,3607,4
+2635,alias_default_642,call_function,alias.default,forward,22,1,1,2,2517,3606,4
+2636,neg_22,call_function,neg.default,forward,22,1,1,1,2518,3605,8
+2637,exp_22,call_function,exp.default,forward,22,1,1,1,2519,3604,6
+2638,add_113,call_function,add.Tensor,forward,22,1,1,1,2520,3603,4
+2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6
+2640,convert_element_type_547,call_function,convert_element_type.default,forward,22,1,1,1,2522,3601,6
+2641,dtype_cast_206,call_function,dtype_cast.default,forward,22,1,1,1,1,3605,3
+2642,permute_251,call_function,permute.default,forward,22,1,1,1,2,3604,3
+2643,alias_default_644,call_function,alias.default,forward,22,1,1,2,3,3603,3
+2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5
+2645,alias_default_643,call_function,alias.default,forward,22,1,1,2,2523,3600,4
+2646,alias_default_645,call_function,alias.default,forward,22,1,1,2,2515,3600,4
+2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8
+2648,dtype_cast_207,call_function,dtype_cast.default,forward,22,1,1,1,1,3601,3
+2649,permute_252,call_function,permute.default,forward,22,1,1,1,2,3600,3
+2650,alias_default_646,call_function,alias.default,forward,22,1,1,2,2531,3598,4
+2651,alias_default_647,call_function,alias.default,forward,22,1,1,2,3,3599,3
+2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5
+2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10
+2654,dtype_cast_208,call_function,dtype_cast.default,forward,23,1,1,1,1,3585,2
+2655,alias_default_648,call_function,alias.default,forward,22,1,1,3,2538,3595,4
+2656,convert_element_type_552,call_function,convert_element_type.default,forward,23,1,1,1,2539,3593,4
+2657,alias_default_650,call_function,alias.default,forward,23,1,1,2,2540,3592,4
+2658,pow_47,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2541,3591,4
+2659,mean_46,call_function,mean.dim,forward,23,1,1,1,2542,3590,4
+2660,add_115,call_function,add.Scalar,forward,23,1,1,1,2543,3589,3
+2661,rsqrt_46,call_function,rsqrt.default,forward,23,1,1,1,2544,3588,3
+2662,alias_default_651,call_function,alias.default,forward,23,1,1,3,2545,3587,3
+2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8
+2664,alias_default_649,call_function,alias.default,forward,23,1,1,2,2,3584,2
+2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8
+2666,convert_element_type_553,call_function,convert_element_type.default,forward,23,1,1,1,2551,3581,6
+2667,dtype_cast_209,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3
+2668,permute_253,call_function,permute.default,forward,23,1,1,1,2,3567,3
+2669,alias_default_652,call_function,alias.default,forward,23,1,1,6,2552,3580,4
+2670,alias_default_653,call_function,alias.default,forward,23,1,1,2,3,3566,3
+2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
+2672,dtype_cast_210,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3
+2673,permute_254,call_function,permute.default,forward,23,1,1,1,2,3567,3
+2674,alias_default_654,call_function,alias.default,forward,23,1,1,2,3,3566,3
+2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
+2676,dtype_cast_211,call_function,dtype_cast.default,forward,23,1,1,1,1,3561,3
+2677,permute_255,call_function,permute.default,forward,23,1,1,1,2,3560,3
+2678,alias_default_655,call_function,alias.default,forward,23,1,1,2,3,3559,3
+2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5
+2680,view_535,call_function,view.default,forward,23,1,1,1,2558,3563,4
+2681,view_536,call_function,view.default,forward,23,1,1,1,2558,3563,4
+2682,view_537,call_function,view.default,forward,23,1,1,1,2558,3556,4
+2683,convert_element_type_560,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4
+2684,view_538,call_function,view.default,forward,23,1,1,1,2560,3561,4
+2685,view_as_complex_46,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6
+2686,convert_element_type_561,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4
+2687,view_539,call_function,view.default,forward,23,1,1,1,2560,3561,4
+2688,view_as_complex_47,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6
+2689,view_540,call_function,view.default,forward,23,1,1,1,2,3571,3
+2690,alias_default_656,call_function,alias.default,forward,23,1,1,4,3,3570,3
+2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
+2692,view_as_real_46,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6
+2693,view_541,call_function,view.default,forward,23,1,1,1,2566,3557,6
+2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
+2695,view_as_real_47,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6
+2696,view_542,call_function,view.default,forward,23,1,1,1,2566,3557,6
+2697,convert_element_type_562,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6
+2698,convert_element_type_563,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6
+2699,permute_256,call_function,permute.default,forward,23,1,1,1,2568,3555,6
+2700,permute_257,call_function,permute.default,forward,23,1,1,1,2568,3555,6
+2701,permute_258,call_function,permute.default,forward,23,1,1,1,2559,3555,4
+2702,alias_default_657,call_function,alias.default,forward,23,1,1,2,2569,3554,4
+2703,alias_default_658,call_function,alias.default,forward,23,1,1,2,2569,3554,4
+2704,alias_default_659,call_function,alias.default,forward,23,1,1,2,2560,3554,4
+2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2
+2706,getitem_207,call_function,getitem,forward,23,1,1,1,2594,3549,2
+2707,getitem_208,call_function,getitem,forward,23,1,1,1,2594,2594,2
+2708,getitem_213,call_function,getitem,forward,23,1,1,1,2594,2594,1
+2709,getitem_214,call_function,getitem,forward,23,1,1,1,2594,2594,1
+2710,alias_default_660,call_function,alias.default,forward,23,1,1,2,2595,3548,4
+2711,permute_259,call_function,permute.default,forward,23,1,1,1,2596,3547,4
+2712,view_543,call_function,view.default,forward,23,1,1,1,2597,3546,3
+2713,dtype_cast_212,call_function,dtype_cast.default,forward,23,1,1,1,1,3548,3
+2714,permute_260,call_function,permute.default,forward,23,1,1,1,2,3547,3
+2715,alias_default_661,call_function,alias.default,forward,23,1,1,2,2598,3545,4
+2716,alias_default_662,call_function,alias.default,forward,23,1,1,2,3,3546,3
+2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5
+2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10
+2719,dtype_cast_213,call_function,dtype_cast.default,forward,23,1,1,1,1,3532,2
+2720,alias_default_663,call_function,alias.default,forward,23,1,1,3,2605,3542,4
+2721,convert_element_type_566,call_function,convert_element_type.default,forward,23,1,1,1,2606,3540,4
+2722,alias_default_665,call_function,alias.default,forward,23,1,1,2,2607,3539,4
+2723,pow_48,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2608,3538,4
+2724,mean_47,call_function,mean.dim,forward,23,1,1,1,2609,3537,4
+2725,add_117,call_function,add.Scalar,forward,23,1,1,1,2610,3536,3
+2726,rsqrt_47,call_function,rsqrt.default,forward,23,1,1,1,2611,3535,3
+2727,alias_default_666,call_function,alias.default,forward,23,1,1,3,2612,3534,3
+2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8
+2729,alias_default_664,call_function,alias.default,forward,23,1,1,2,2,3531,2
+2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8
+2731,convert_element_type_567,call_function,convert_element_type.default,forward,23,1,1,1,2618,3528,6
+2732,dtype_cast_214,call_function,dtype_cast.default,forward,23,1,1,1,1,3528,3
+2733,permute_261,call_function,permute.default,forward,23,1,1,1,2,3527,3
+2734,alias_default_667,call_function,alias.default,forward,23,1,1,4,2619,3527,4
+2735,alias_default_668,call_function,alias.default,forward,23,1,1,2,3,3526,3
+2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5
+2737,alias_default_669,call_function,alias.default,forward,23,1,1,2,2625,3523,4
+2738,convert_element_type_570,call_function,convert_element_type.default,forward,23,1,1,1,2626,3511,4
+2739,alias_default_670,call_function,alias.default,forward,23,1,1,2,2627,3510,4
+2740,neg_23,call_function,neg.default,forward,23,1,1,1,2628,3509,8
+2741,exp_23,call_function,exp.default,forward,23,1,1,1,2629,3508,6
+2742,add_118,call_function,add.Tensor,forward,23,1,1,1,2630,3507,4
+2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6
+2744,convert_element_type_571,call_function,convert_element_type.default,forward,23,1,1,1,2632,3505,6
+2745,dtype_cast_215,call_function,dtype_cast.default,forward,23,1,1,1,1,3509,3
+2746,permute_262,call_function,permute.default,forward,23,1,1,1,2,3508,3
+2747,alias_default_672,call_function,alias.default,forward,23,1,1,2,3,3507,3
+2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5
+2749,alias_default_671,call_function,alias.default,forward,23,1,1,2,2633,3504,4
+2750,alias_default_673,call_function,alias.default,forward,23,1,1,2,2625,3504,4
+2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8
+2752,dtype_cast_216,call_function,dtype_cast.default,forward,23,1,1,1,1,3505,3
+2753,permute_263,call_function,permute.default,forward,23,1,1,1,2,3504,3
+2754,alias_default_674,call_function,alias.default,forward,23,1,1,2,2641,3502,4
+2755,alias_default_675,call_function,alias.default,forward,23,1,1,2,3,3503,3
+2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5
+2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10
+2758,dtype_cast_217,call_function,dtype_cast.default,forward,24,1,1,1,1,3489,2
+2759,alias_default_676,call_function,alias.default,forward,23,1,1,3,2648,3499,4
+2760,convert_element_type_576,call_function,convert_element_type.default,forward,24,1,1,1,2649,3497,4
+2761,alias_default_678,call_function,alias.default,forward,24,1,1,2,2650,3496,4
+2762,pow_49,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2651,3495,4
+2763,mean_48,call_function,mean.dim,forward,24,1,1,1,2652,3494,4
+2764,add_120,call_function,add.Scalar,forward,24,1,1,1,2653,3493,3
+2765,rsqrt_48,call_function,rsqrt.default,forward,24,1,1,1,2654,3492,3
+2766,alias_default_679,call_function,alias.default,forward,24,1,1,3,2655,3491,3
+2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8
+2768,alias_default_677,call_function,alias.default,forward,24,1,1,2,2,3488,2
+2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8
+2770,convert_element_type_577,call_function,convert_element_type.default,forward,24,1,1,1,2661,3485,6
+2771,dtype_cast_218,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3
+2772,permute_264,call_function,permute.default,forward,24,1,1,1,2,3471,3
+2773,alias_default_680,call_function,alias.default,forward,24,1,1,6,2662,3484,4
+2774,alias_default_681,call_function,alias.default,forward,24,1,1,2,3,3470,3
+2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
+2776,dtype_cast_219,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3
+2777,permute_265,call_function,permute.default,forward,24,1,1,1,2,3471,3
+2778,alias_default_682,call_function,alias.default,forward,24,1,1,2,3,3470,3
+2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
+2780,dtype_cast_220,call_function,dtype_cast.default,forward,24,1,1,1,1,3465,3
+2781,permute_266,call_function,permute.default,forward,24,1,1,1,2,3464,3
+2782,alias_default_683,call_function,alias.default,forward,24,1,1,2,3,3463,3
+2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5
+2784,view_558,call_function,view.default,forward,24,1,1,1,2668,3467,4
+2785,view_559,call_function,view.default,forward,24,1,1,1,2668,3467,4
+2786,view_560,call_function,view.default,forward,24,1,1,1,2668,3460,4
+2787,convert_element_type_584,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4
+2788,view_561,call_function,view.default,forward,24,1,1,1,2670,3465,4
+2789,view_as_complex_48,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6
+2790,convert_element_type_585,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4
+2791,view_562,call_function,view.default,forward,24,1,1,1,2670,3465,4
+2792,view_as_complex_49,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6
+2793,view_563,call_function,view.default,forward,24,1,1,1,2,3475,3
+2794,alias_default_684,call_function,alias.default,forward,24,1,1,4,3,3474,3
+2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
+2796,view_as_real_48,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6
+2797,view_564,call_function,view.default,forward,24,1,1,1,2676,3461,6
+2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
+2799,view_as_real_49,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6
+2800,view_565,call_function,view.default,forward,24,1,1,1,2676,3461,6
+2801,convert_element_type_586,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6
+2802,convert_element_type_587,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6
+2803,permute_267,call_function,permute.default,forward,24,1,1,1,2678,3459,6
+2804,permute_268,call_function,permute.default,forward,24,1,1,1,2678,3459,6
+2805,permute_269,call_function,permute.default,forward,24,1,1,1,2669,3459,4
+2806,alias_default_685,call_function,alias.default,forward,24,1,1,2,2679,3458,4
+2807,alias_default_686,call_function,alias.default,forward,24,1,1,2,2679,3458,4
+2808,alias_default_687,call_function,alias.default,forward,24,1,1,2,2670,3458,4
+2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2
+2810,getitem_216,call_function,getitem,forward,24,1,1,1,2704,3453,2
+2811,getitem_217,call_function,getitem,forward,24,1,1,1,2704,2704,2
+2812,getitem_222,call_function,getitem,forward,24,1,1,1,2704,2704,1
+2813,getitem_223,call_function,getitem,forward,24,1,1,1,2704,2704,1
+2814,alias_default_688,call_function,alias.default,forward,24,1,1,2,2705,3452,4
+2815,permute_270,call_function,permute.default,forward,24,1,1,1,2706,3451,4
+2816,view_566,call_function,view.default,forward,24,1,1,1,2707,3450,3
+2817,dtype_cast_221,call_function,dtype_cast.default,forward,24,1,1,1,1,3452,3
+2818,permute_271,call_function,permute.default,forward,24,1,1,1,2,3451,3
+2819,alias_default_689,call_function,alias.default,forward,24,1,1,2,2708,3449,4
+2820,alias_default_690,call_function,alias.default,forward,24,1,1,2,3,3450,3
+2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5
+2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10
+2823,dtype_cast_222,call_function,dtype_cast.default,forward,24,1,1,1,1,3436,2
+2824,alias_default_691,call_function,alias.default,forward,24,1,1,3,2715,3446,4
+2825,convert_element_type_590,call_function,convert_element_type.default,forward,24,1,1,1,2716,3444,4
+2826,alias_default_693,call_function,alias.default,forward,24,1,1,2,2717,3443,4
+2827,pow_50,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2718,3442,4
+2828,mean_49,call_function,mean.dim,forward,24,1,1,1,2719,3441,4
+2829,add_122,call_function,add.Scalar,forward,24,1,1,1,2720,3440,3
+2830,rsqrt_49,call_function,rsqrt.default,forward,24,1,1,1,2721,3439,3
+2831,alias_default_694,call_function,alias.default,forward,24,1,1,3,2722,3438,3
+2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8
+2833,alias_default_692,call_function,alias.default,forward,24,1,1,2,2,3435,2
+2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8
+2835,convert_element_type_591,call_function,convert_element_type.default,forward,24,1,1,1,2728,3432,6
+2836,dtype_cast_223,call_function,dtype_cast.default,forward,24,1,1,1,1,3432,3
+2837,permute_272,call_function,permute.default,forward,24,1,1,1,2,3431,3
+2838,alias_default_695,call_function,alias.default,forward,24,1,1,4,2729,3431,4
+2839,alias_default_696,call_function,alias.default,forward,24,1,1,2,3,3430,3
+2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5
+2841,alias_default_697,call_function,alias.default,forward,24,1,1,2,2735,3427,4
+2842,convert_element_type_594,call_function,convert_element_type.default,forward,24,1,1,1,2736,3415,4
+2843,alias_default_698,call_function,alias.default,forward,24,1,1,2,2737,3414,4
+2844,neg_24,call_function,neg.default,forward,24,1,1,1,2738,3413,8
+2845,exp_24,call_function,exp.default,forward,24,1,1,1,2739,3412,6
+2846,add_123,call_function,add.Tensor,forward,24,1,1,1,2740,3411,4
+2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6
+2848,convert_element_type_595,call_function,convert_element_type.default,forward,24,1,1,1,2742,3409,6
+2849,dtype_cast_224,call_function,dtype_cast.default,forward,24,1,1,1,1,3413,3
+2850,permute_273,call_function,permute.default,forward,24,1,1,1,2,3412,3
+2851,alias_default_700,call_function,alias.default,forward,24,1,1,2,3,3411,3
+2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5
+2853,alias_default_699,call_function,alias.default,forward,24,1,1,2,2743,3408,4
+2854,alias_default_701,call_function,alias.default,forward,24,1,1,2,2735,3408,4
+2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8
+2856,dtype_cast_225,call_function,dtype_cast.default,forward,24,1,1,1,1,3409,3
+2857,permute_274,call_function,permute.default,forward,24,1,1,1,2,3408,3
+2858,alias_default_702,call_function,alias.default,forward,24,1,1,2,2751,3406,4
+2859,alias_default_703,call_function,alias.default,forward,24,1,1,2,3,3407,3
+2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5
+2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10
+2862,dtype_cast_226,call_function,dtype_cast.default,forward,25,1,1,1,1,3393,2
+2863,alias_default_704,call_function,alias.default,forward,24,1,1,3,2758,3403,4
+2864,convert_element_type_600,call_function,convert_element_type.default,forward,25,1,1,1,2759,3401,4
+2865,alias_default_706,call_function,alias.default,forward,25,1,1,2,2760,3400,4
+2866,pow_51,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2761,3399,4
+2867,mean_50,call_function,mean.dim,forward,25,1,1,1,2762,3398,4
+2868,add_125,call_function,add.Scalar,forward,25,1,1,1,2763,3397,3
+2869,rsqrt_50,call_function,rsqrt.default,forward,25,1,1,1,2764,3396,3
+2870,alias_default_707,call_function,alias.default,forward,25,1,1,3,2765,3395,3
+2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8
+2872,alias_default_705,call_function,alias.default,forward,25,1,1,2,2,3392,2
+2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8
+2874,convert_element_type_601,call_function,convert_element_type.default,forward,25,1,1,1,2771,3389,6
+2875,dtype_cast_227,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3
+2876,permute_275,call_function,permute.default,forward,25,1,1,1,2,3375,3
+2877,alias_default_708,call_function,alias.default,forward,25,1,1,6,2772,3388,4
+2878,alias_default_709,call_function,alias.default,forward,25,1,1,2,3,3374,3
+2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
+2880,dtype_cast_228,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3
+2881,permute_276,call_function,permute.default,forward,25,1,1,1,2,3375,3
+2882,alias_default_710,call_function,alias.default,forward,25,1,1,2,3,3374,3
+2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
+2884,dtype_cast_229,call_function,dtype_cast.default,forward,25,1,1,1,1,3369,3
+2885,permute_277,call_function,permute.default,forward,25,1,1,1,2,3368,3
+2886,alias_default_711,call_function,alias.default,forward,25,1,1,2,3,3367,3
+2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5
+2888,view_581,call_function,view.default,forward,25,1,1,1,2778,3371,4
+2889,view_582,call_function,view.default,forward,25,1,1,1,2778,3371,4
+2890,view_583,call_function,view.default,forward,25,1,1,1,2778,3364,4
+2891,convert_element_type_608,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4
+2892,view_584,call_function,view.default,forward,25,1,1,1,2780,3369,4
+2893,view_as_complex_50,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6
+2894,convert_element_type_609,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4
+2895,view_585,call_function,view.default,forward,25,1,1,1,2780,3369,4
+2896,view_as_complex_51,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6
+2897,view_586,call_function,view.default,forward,25,1,1,1,2,3379,3
+2898,alias_default_712,call_function,alias.default,forward,25,1,1,4,3,3378,3
+2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
+2900,view_as_real_50,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6
+2901,view_587,call_function,view.default,forward,25,1,1,1,2786,3365,6
+2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
+2903,view_as_real_51,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6
+2904,view_588,call_function,view.default,forward,25,1,1,1,2786,3365,6
+2905,convert_element_type_610,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6
+2906,convert_element_type_611,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6
+2907,permute_278,call_function,permute.default,forward,25,1,1,1,2788,3363,6
+2908,permute_279,call_function,permute.default,forward,25,1,1,1,2788,3363,6
+2909,permute_280,call_function,permute.default,forward,25,1,1,1,2779,3363,4
+2910,alias_default_713,call_function,alias.default,forward,25,1,1,2,2789,3362,4
+2911,alias_default_714,call_function,alias.default,forward,25,1,1,2,2789,3362,4
+2912,alias_default_715,call_function,alias.default,forward,25,1,1,2,2780,3362,4
+2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2
+2914,getitem_225,call_function,getitem,forward,25,1,1,1,2814,3357,2
+2915,getitem_226,call_function,getitem,forward,25,1,1,1,2814,2814,2
+2916,getitem_231,call_function,getitem,forward,25,1,1,1,2814,2814,1
+2917,getitem_232,call_function,getitem,forward,25,1,1,1,2814,2814,1
+2918,alias_default_716,call_function,alias.default,forward,25,1,1,2,2815,3356,4
+2919,permute_281,call_function,permute.default,forward,25,1,1,1,2816,3355,4
+2920,view_589,call_function,view.default,forward,25,1,1,1,2817,3354,3
+2921,dtype_cast_230,call_function,dtype_cast.default,forward,25,1,1,1,1,3356,3
+2922,permute_282,call_function,permute.default,forward,25,1,1,1,2,3355,3
+2923,alias_default_717,call_function,alias.default,forward,25,1,1,2,2818,3353,4
+2924,alias_default_718,call_function,alias.default,forward,25,1,1,2,3,3354,3
+2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5
+2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10
+2927,dtype_cast_231,call_function,dtype_cast.default,forward,25,1,1,1,1,3340,2
+2928,alias_default_719,call_function,alias.default,forward,25,1,1,3,2825,3350,4
+2929,convert_element_type_614,call_function,convert_element_type.default,forward,25,1,1,1,2826,3348,4
+2930,alias_default_721,call_function,alias.default,forward,25,1,1,2,2827,3347,4
+2931,pow_52,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2828,3346,4
+2932,mean_51,call_function,mean.dim,forward,25,1,1,1,2829,3345,4
+2933,add_127,call_function,add.Scalar,forward,25,1,1,1,2830,3344,3
+2934,rsqrt_51,call_function,rsqrt.default,forward,25,1,1,1,2831,3343,3
+2935,alias_default_722,call_function,alias.default,forward,25,1,1,3,2832,3342,3
+2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8
+2937,alias_default_720,call_function,alias.default,forward,25,1,1,2,2,3339,2
+2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8
+2939,convert_element_type_615,call_function,convert_element_type.default,forward,25,1,1,1,2838,3336,6
+2940,dtype_cast_232,call_function,dtype_cast.default,forward,25,1,1,1,1,3336,3
+2941,permute_283,call_function,permute.default,forward,25,1,1,1,2,3335,3
+2942,alias_default_723,call_function,alias.default,forward,25,1,1,4,2839,3335,4
+2943,alias_default_724,call_function,alias.default,forward,25,1,1,2,3,3334,3
+2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5
+2945,alias_default_725,call_function,alias.default,forward,25,1,1,2,2845,3331,4
+2946,convert_element_type_618,call_function,convert_element_type.default,forward,25,1,1,1,2846,3319,4
+2947,alias_default_726,call_function,alias.default,forward,25,1,1,2,2847,3318,4
+2948,neg_25,call_function,neg.default,forward,25,1,1,1,2848,3317,8
+2949,exp_25,call_function,exp.default,forward,25,1,1,1,2849,3316,6
+2950,add_128,call_function,add.Tensor,forward,25,1,1,1,2850,3315,4
+2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6
+2952,convert_element_type_619,call_function,convert_element_type.default,forward,25,1,1,1,2852,3313,6
+2953,dtype_cast_233,call_function,dtype_cast.default,forward,25,1,1,1,1,3317,3
+2954,permute_284,call_function,permute.default,forward,25,1,1,1,2,3316,3
+2955,alias_default_728,call_function,alias.default,forward,25,1,1,2,3,3315,3
+2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5
+2957,alias_default_727,call_function,alias.default,forward,25,1,1,2,2853,3312,4
+2958,alias_default_729,call_function,alias.default,forward,25,1,1,2,2845,3312,4
+2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8
+2960,dtype_cast_234,call_function,dtype_cast.default,forward,25,1,1,1,1,3313,3
+2961,permute_285,call_function,permute.default,forward,25,1,1,1,2,3312,3
+2962,alias_default_730,call_function,alias.default,forward,25,1,1,2,2861,3310,4
+2963,alias_default_731,call_function,alias.default,forward,25,1,1,2,3,3311,3
+2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5
+2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10
+2966,dtype_cast_235,call_function,dtype_cast.default,forward,26,1,1,1,1,3297,2
+2967,alias_default_732,call_function,alias.default,forward,25,1,1,3,2868,3307,4
+2968,convert_element_type_624,call_function,convert_element_type.default,forward,26,1,1,1,2869,3305,4
+2969,alias_default_734,call_function,alias.default,forward,26,1,1,2,2870,3304,4
+2970,pow_53,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2871,3303,4
+2971,mean_52,call_function,mean.dim,forward,26,1,1,1,2872,3302,4
+2972,add_130,call_function,add.Scalar,forward,26,1,1,1,2873,3301,3
+2973,rsqrt_52,call_function,rsqrt.default,forward,26,1,1,1,2874,3300,3
+2974,alias_default_735,call_function,alias.default,forward,26,1,1,3,2875,3299,3
+2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8
+2976,alias_default_733,call_function,alias.default,forward,26,1,1,2,2,3296,2
+2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8
+2978,convert_element_type_625,call_function,convert_element_type.default,forward,26,1,1,1,2881,3293,6
+2979,dtype_cast_236,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3
+2980,permute_286,call_function,permute.default,forward,26,1,1,1,2,3279,3
+2981,alias_default_736,call_function,alias.default,forward,26,1,1,6,2882,3292,4
+2982,alias_default_737,call_function,alias.default,forward,26,1,1,2,3,3278,3
+2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
+2984,dtype_cast_237,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3
+2985,permute_287,call_function,permute.default,forward,26,1,1,1,2,3279,3
+2986,alias_default_738,call_function,alias.default,forward,26,1,1,2,3,3278,3
+2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
+2988,dtype_cast_238,call_function,dtype_cast.default,forward,26,1,1,1,1,3273,3
+2989,permute_288,call_function,permute.default,forward,26,1,1,1,2,3272,3
+2990,alias_default_739,call_function,alias.default,forward,26,1,1,2,3,3271,3
+2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5
+2992,view_604,call_function,view.default,forward,26,1,1,1,2888,3275,4
+2993,view_605,call_function,view.default,forward,26,1,1,1,2888,3275,4
+2994,view_606,call_function,view.default,forward,26,1,1,1,2888,3268,4
+2995,convert_element_type_632,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4
+2996,view_607,call_function,view.default,forward,26,1,1,1,2890,3273,4
+2997,view_as_complex_52,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6
+2998,convert_element_type_633,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4
+2999,view_608,call_function,view.default,forward,26,1,1,1,2890,3273,4
+3000,view_as_complex_53,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6
+3001,view_609,call_function,view.default,forward,26,1,1,1,2,3283,3
+3002,alias_default_740,call_function,alias.default,forward,26,1,1,4,3,3282,3
+3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
+3004,view_as_real_52,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6
+3005,view_610,call_function,view.default,forward,26,1,1,1,2896,3269,6
+3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
+3007,view_as_real_53,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6
+3008,view_611,call_function,view.default,forward,26,1,1,1,2896,3269,6
+3009,convert_element_type_634,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6
+3010,convert_element_type_635,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6
+3011,permute_289,call_function,permute.default,forward,26,1,1,1,2898,3267,6
+3012,permute_290,call_function,permute.default,forward,26,1,1,1,2898,3267,6
+3013,permute_291,call_function,permute.default,forward,26,1,1,1,2889,3267,4
+3014,alias_default_741,call_function,alias.default,forward,26,1,1,2,2899,3266,4
+3015,alias_default_742,call_function,alias.default,forward,26,1,1,2,2899,3266,4
+3016,alias_default_743,call_function,alias.default,forward,26,1,1,2,2890,3266,4
+3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2
+3018,getitem_234,call_function,getitem,forward,26,1,1,1,2924,3261,2
+3019,getitem_235,call_function,getitem,forward,26,1,1,1,2924,2924,2
+3020,getitem_240,call_function,getitem,forward,26,1,1,1,2924,2924,1
+3021,getitem_241,call_function,getitem,forward,26,1,1,1,2924,2924,1
+3022,alias_default_744,call_function,alias.default,forward,26,1,1,2,2925,3260,4
+3023,permute_292,call_function,permute.default,forward,26,1,1,1,2926,3259,4
+3024,view_612,call_function,view.default,forward,26,1,1,1,2927,3258,3
+3025,dtype_cast_239,call_function,dtype_cast.default,forward,26,1,1,1,1,3260,3
+3026,permute_293,call_function,permute.default,forward,26,1,1,1,2,3259,3
+3027,alias_default_745,call_function,alias.default,forward,26,1,1,2,2928,3257,4
+3028,alias_default_746,call_function,alias.default,forward,26,1,1,2,3,3258,3
+3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5
+3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10
+3031,dtype_cast_240,call_function,dtype_cast.default,forward,26,1,1,1,1,3244,2
+3032,alias_default_747,call_function,alias.default,forward,26,1,1,3,2935,3254,4
+3033,convert_element_type_638,call_function,convert_element_type.default,forward,26,1,1,1,2936,3252,4
+3034,alias_default_749,call_function,alias.default,forward,26,1,1,2,2937,3251,4
+3035,pow_54,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2938,3250,4
+3036,mean_53,call_function,mean.dim,forward,26,1,1,1,2939,3249,4
+3037,add_132,call_function,add.Scalar,forward,26,1,1,1,2940,3248,3
+3038,rsqrt_53,call_function,rsqrt.default,forward,26,1,1,1,2941,3247,3
+3039,alias_default_750,call_function,alias.default,forward,26,1,1,3,2942,3246,3
+3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8
+3041,alias_default_748,call_function,alias.default,forward,26,1,1,2,2,3243,2
+3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8
+3043,convert_element_type_639,call_function,convert_element_type.default,forward,26,1,1,1,2948,3240,6
+3044,dtype_cast_241,call_function,dtype_cast.default,forward,26,1,1,1,1,3240,3
+3045,permute_294,call_function,permute.default,forward,26,1,1,1,2,3239,3
+3046,alias_default_751,call_function,alias.default,forward,26,1,1,4,2949,3239,4
+3047,alias_default_752,call_function,alias.default,forward,26,1,1,2,3,3238,3
+3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5
+3049,alias_default_753,call_function,alias.default,forward,26,1,1,2,2955,3235,4
+3050,convert_element_type_642,call_function,convert_element_type.default,forward,26,1,1,1,2956,3223,4
+3051,alias_default_754,call_function,alias.default,forward,26,1,1,2,2957,3222,4
+3052,neg_26,call_function,neg.default,forward,26,1,1,1,2958,3221,8
+3053,exp_26,call_function,exp.default,forward,26,1,1,1,2959,3220,6
+3054,add_133,call_function,add.Tensor,forward,26,1,1,1,2960,3219,4
+3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6
+3056,convert_element_type_643,call_function,convert_element_type.default,forward,26,1,1,1,2962,3217,6
+3057,dtype_cast_242,call_function,dtype_cast.default,forward,26,1,1,1,1,3221,3
+3058,permute_295,call_function,permute.default,forward,26,1,1,1,2,3220,3
+3059,alias_default_756,call_function,alias.default,forward,26,1,1,2,3,3219,3
+3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5
+3061,alias_default_755,call_function,alias.default,forward,26,1,1,2,2963,3216,4
+3062,alias_default_757,call_function,alias.default,forward,26,1,1,2,2955,3216,4
+3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8
+3064,dtype_cast_243,call_function,dtype_cast.default,forward,26,1,1,1,1,3217,3
+3065,permute_296,call_function,permute.default,forward,26,1,1,1,2,3216,3
+3066,alias_default_758,call_function,alias.default,forward,26,1,1,2,2971,3214,4
+3067,alias_default_759,call_function,alias.default,forward,26,1,1,2,3,3215,3
+3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5
+3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10
+3070,dtype_cast_244,call_function,dtype_cast.default,forward,27,1,1,1,1,3201,2
+3071,alias_default_760,call_function,alias.default,forward,26,1,1,3,2978,3211,4
+3072,convert_element_type_648,call_function,convert_element_type.default,forward,27,1,1,1,2979,3209,4
+3073,alias_default_762,call_function,alias.default,forward,27,1,1,2,2980,3208,4
+3074,pow_55,call_function,pow.Tensor_Scalar,forward,27,1,1,1,2981,3207,4
+3075,mean_54,call_function,mean.dim,forward,27,1,1,1,2982,3206,4
+3076,add_135,call_function,add.Scalar,forward,27,1,1,1,2983,3205,3
+3077,rsqrt_54,call_function,rsqrt.default,forward,27,1,1,1,2984,3204,3
+3078,alias_default_763,call_function,alias.default,forward,27,1,1,3,2985,3203,3
+3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8
+3080,alias_default_761,call_function,alias.default,forward,27,1,1,2,2,3200,2
+3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8
+3082,convert_element_type_649,call_function,convert_element_type.default,forward,27,1,1,1,2991,3197,6
+3083,dtype_cast_245,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3
+3084,permute_297,call_function,permute.default,forward,27,1,1,1,2,3183,3
+3085,alias_default_764,call_function,alias.default,forward,27,1,1,6,2992,3196,4
+3086,alias_default_765,call_function,alias.default,forward,27,1,1,2,3,3182,3
+3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
+3088,dtype_cast_246,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3
+3089,permute_298,call_function,permute.default,forward,27,1,1,1,2,3183,3
+3090,alias_default_766,call_function,alias.default,forward,27,1,1,2,3,3182,3
+3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
+3092,dtype_cast_247,call_function,dtype_cast.default,forward,27,1,1,1,1,3177,3
+3093,permute_299,call_function,permute.default,forward,27,1,1,1,2,3176,3
+3094,alias_default_767,call_function,alias.default,forward,27,1,1,2,3,3175,3
+3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5
+3096,view_627,call_function,view.default,forward,27,1,1,1,2998,3179,4
+3097,view_628,call_function,view.default,forward,27,1,1,1,2998,3179,4
+3098,view_629,call_function,view.default,forward,27,1,1,1,2998,3172,4
+3099,convert_element_type_656,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4
+3100,view_630,call_function,view.default,forward,27,1,1,1,3000,3177,4
+3101,view_as_complex_54,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6
+3102,convert_element_type_657,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4
+3103,view_631,call_function,view.default,forward,27,1,1,1,3000,3177,4
+3104,view_as_complex_55,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6
+3105,view_632,call_function,view.default,forward,27,1,1,1,2,3187,3
+3106,alias_default_768,call_function,alias.default,forward,27,1,1,4,3,3186,3
+3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
+3108,view_as_real_54,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6
+3109,view_633,call_function,view.default,forward,27,1,1,1,3006,3173,6
+3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
+3111,view_as_real_55,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6
+3112,view_634,call_function,view.default,forward,27,1,1,1,3006,3173,6
+3113,convert_element_type_658,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6
+3114,convert_element_type_659,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6
+3115,permute_300,call_function,permute.default,forward,27,1,1,1,3008,3171,6
+3116,permute_301,call_function,permute.default,forward,27,1,1,1,3008,3171,6
+3117,permute_302,call_function,permute.default,forward,27,1,1,1,2999,3171,4
+3118,alias_default_769,call_function,alias.default,forward,27,1,1,2,3009,3170,4
+3119,alias_default_770,call_function,alias.default,forward,27,1,1,2,3009,3170,4
+3120,alias_default_771,call_function,alias.default,forward,27,1,1,2,3000,3170,4
+3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2
+3122,getitem_243,call_function,getitem,forward,27,1,1,1,3034,3165,2
+3123,getitem_244,call_function,getitem,forward,27,1,1,1,3034,3034,2
+3124,getitem_249,call_function,getitem,forward,27,1,1,1,3034,3034,1
+3125,getitem_250,call_function,getitem,forward,27,1,1,1,3034,3034,1
+3126,alias_default_772,call_function,alias.default,forward,27,1,1,2,3035,3164,4
+3127,permute_303,call_function,permute.default,forward,27,1,1,1,3036,3163,4
+3128,view_635,call_function,view.default,forward,27,1,1,1,3037,3162,3
+3129,dtype_cast_248,call_function,dtype_cast.default,forward,27,1,1,1,1,3164,3
+3130,permute_304,call_function,permute.default,forward,27,1,1,1,2,3163,3
+3131,alias_default_773,call_function,alias.default,forward,27,1,1,2,3038,3161,4
+3132,alias_default_774,call_function,alias.default,forward,27,1,1,2,3,3162,3
+3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5
+3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10
+3135,dtype_cast_249,call_function,dtype_cast.default,forward,27,1,1,1,1,3148,2
+3136,alias_default_775,call_function,alias.default,forward,27,1,1,3,3045,3158,4
+3137,convert_element_type_662,call_function,convert_element_type.default,forward,27,1,1,1,3046,3156,4
+3138,alias_default_777,call_function,alias.default,forward,27,1,1,2,3047,3155,4
+3139,pow_56,call_function,pow.Tensor_Scalar,forward,27,1,1,1,3048,3154,4
+3140,mean_55,call_function,mean.dim,forward,27,1,1,1,3049,3153,4
+3141,add_137,call_function,add.Scalar,forward,27,1,1,1,3050,3152,3
+3142,rsqrt_55,call_function,rsqrt.default,forward,27,1,1,1,3051,3151,3
+3143,alias_default_778,call_function,alias.default,forward,27,1,1,3,3052,3150,3
+3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8
+3145,alias_default_776,call_function,alias.default,forward,27,1,1,2,2,3147,2
+3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8
+3147,convert_element_type_663,call_function,convert_element_type.default,forward,27,1,1,1,3058,3144,6
+3148,dtype_cast_250,call_function,dtype_cast.default,forward,27,1,1,1,1,3144,3
+3149,permute_305,call_function,permute.default,forward,27,1,1,1,2,3143,3
+3150,alias_default_779,call_function,alias.default,forward,27,1,1,4,3059,3143,4
+3151,alias_default_780,call_function,alias.default,forward,27,1,1,2,3,3142,3
+3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5
+3153,alias_default_781,call_function,alias.default,forward,27,1,1,2,3065,3139,4
+3154,convert_element_type_666,call_function,convert_element_type.default,forward,27,1,1,1,3066,3127,4
+3155,alias_default_782,call_function,alias.default,forward,27,1,1,2,3067,3126,4
+3156,neg_27,call_function,neg.default,forward,27,1,1,1,3068,3125,8
+3157,exp_27,call_function,exp.default,forward,27,1,1,1,3069,3124,6
+3158,add_138,call_function,add.Tensor,forward,27,1,1,1,3070,3123,4
+3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6
+3160,convert_element_type_667,call_function,convert_element_type.default,forward,27,1,1,1,3072,3121,6
+3161,dtype_cast_251,call_function,dtype_cast.default,forward,27,1,1,1,1,3125,3
+3162,permute_306,call_function,permute.default,forward,27,1,1,1,2,3124,3
+3163,alias_default_784,call_function,alias.default,forward,27,1,1,2,3,3123,3
+3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5
+3165,alias_default_783,call_function,alias.default,forward,27,1,1,2,3073,3120,4
+3166,alias_default_785,call_function,alias.default,forward,27,1,1,2,3065,3120,4
+3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8
+3168,dtype_cast_252,call_function,dtype_cast.default,forward,27,1,1,1,1,3121,3
+3169,permute_307,call_function,permute.default,forward,27,1,1,1,2,3120,3
+3170,alias_default_786,call_function,alias.default,forward,27,1,1,2,3081,3118,4
+3171,alias_default_787,call_function,alias.default,forward,27,1,1,2,3,3119,3
+3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5
+3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10
+3174,dtype_cast_253,call_function,dtype_cast.default,forward,,1,1,1,1,3102,2
+3175,alias_default_788,call_function,alias.default,forward,27,1,1,2,3088,3115,4
+3176,convert_element_type_672,call_function,convert_element_type.default,forward,,1,1,1,3089,3113,4
+3177,alias_default_790,call_function,alias.default,forward,,1,1,2,3090,3112,4
+3178,pow_57,call_function,pow.Tensor_Scalar,forward,,1,1,1,3091,3111,4
+3179,mean_56,call_function,mean.dim,forward,,1,1,1,3092,3110,4
+3180,add_140,call_function,add.Scalar,forward,,1,1,1,3093,3109,3
+3181,rsqrt_56,call_function,rsqrt.default,forward,,1,1,1,3094,3108,3
+3182,alias_default_791,call_function,alias.default,forward,,1,1,3,3095,3107,3
+3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8
+3184,alias_default_789,call_function,alias.default,forward,,1,1,2,2,3101,2
+3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8
+3186,convert_element_type_673,call_function,convert_element_type.default,forward,,1,1,1,3101,8,6
+3187,dtype_cast_254,call_function,dtype_cast.default,forward,,1,1,1,2,3105,3
+3188,permute_308,call_function,permute.default,forward,,1,1,1,3,3104,3
+3189,alias_default_792,call_function,alias.default,forward,,1,1,2,3102,7,4
+3190,alias_default_793,call_function,alias.default,forward,,1,1,2,4,3103,3
+3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5
+3192,alias_default_1245,call_function,alias.default,forward,,1,1,0,3107,0,4
+3193,alias_default_3,call_function,alias.default,unknown,,1,1,2,1,3103,4
+3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5
+3195,permute_311,call_function,permute.default,backward,,1,1,1,5,3100,3
+3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5
+3197,permute_312,call_function,permute.default,backward,,1,1,1,3106,3,4
+3198,dtype_cast_255,call_function,dtype_cast.default,backward,,1,1,1,3107,2,4
+3199,convert_element_type_680,call_function,convert_element_type.default,backward,,1,1,1,9,3098,5
+3200,convert_element_type_681,call_function,convert_element_type.default,backward,,1,1,1,3089,3098,4
+3201,convert_element_type_682,call_function,convert_element_type.default,backward,,1,1,1,3,3092,2
+3202,alias_default_794,call_function,alias.default,backward,,1,1,2,10,3097,4
+3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8
+3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8
+3205,alias_default_795,call_function,alias.default,backward,,1,1,2,16,3090,4
+3206,alias_default_796,call_function,alias.default,backward,,1,1,3,3098,3096,4
+3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8
+3208,sum_1,call_function,sum.dim_IntList,backward,,1,1,1,3115,3088,5
+3209,div_28,call_function,div.Tensor,backward,,1,1,1,3099,3088,6
+3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8
+3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10
+3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8
+3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8
+3214,sum_2,call_function,sum.dim_IntList,backward,,1,1,1,3109,3,5
+3215,convert_element_type_683,call_function,convert_element_type.default,backward,,1,1,1,3120,3084,6
+3216,convert_element_type_684,call_function,convert_element_type.default,backward,,1,1,1,3110,2,3
+3217,dtype_cast_256,call_function,dtype_cast.default,backward,,1,1,1,3111,1,3
+3218,alias_default_1499,call_function,alias.default,backward,,1,1,0,3112,0,2
+3219,alias_default_797,call_function,alias.default,backward,,1,1,3,3121,3083,4
+3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5
+3221,permute_315,call_function,permute.default,backward,27,1,1,1,4,3079,3
+3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5
+3223,permute_316,call_function,permute.default,backward,27,1,1,1,3123,2,4
+3224,dtype_cast_257,call_function,dtype_cast.default,backward,27,1,1,1,3124,1,4
+3225,alias_default_1495,call_function,alias.default,backward,27,1,1,0,3125,0,3
+3226,alias_default_798,call_function,alias.default,backward,27,1,1,2,3124,3077,4
+3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8
+3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8
+3229,alias_default_799,call_function,alias.default,backward,27,1,1,2,3126,3064,4
+3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5
+3231,permute_319,call_function,permute.default,backward,27,1,1,1,4,3060,3
+3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5
+3233,permute_320,call_function,permute.default,backward,27,1,1,1,3128,2,4
+3234,dtype_cast_258,call_function,dtype_cast.default,backward,27,1,1,1,3129,1,4
+3235,alias_default_1496,call_function,alias.default,backward,27,1,1,0,3130,0,3
+3236,convert_element_type_693,call_function,convert_element_type.default,backward,27,1,1,1,3126,3068,6
+3237,convert_element_type_694,call_function,convert_element_type.default,backward,27,1,1,1,3066,3078,4
+3238,alias_default_800,call_function,alias.default,backward,27,1,1,2,3067,3077,4
+3239,neg_28,call_function,neg.default,backward,27,1,1,1,3068,3076,8
+3240,exp_28,call_function,exp.default,backward,27,1,1,1,3069,3075,6
+3241,add_141,call_function,add.Tensor,backward,27,1,1,1,3070,3074,4
+3242,reciprocal,call_function,reciprocal.default,backward,27,1,1,1,3071,3073,4
+3243,mul_206,call_function,mul.Tensor,backward,27,1,1,1,3072,3072,6
+3244,alias_default_801,call_function,alias.default,backward,27,1,1,2,3073,3071,4
+3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8
+3246,sub_1,call_function,sub.Tensor,backward,27,1,1,1,3074,3069,4
+3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8
+3248,add_142,call_function,add.Tensor,backward,27,1,1,1,3076,3067,4
+3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8
+3250,convert_element_type_695,call_function,convert_element_type.default,backward,27,1,1,1,3140,3065,6
+3251,alias_default_802,call_function,alias.default,backward,27,1,1,2,3141,3064,4
+3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5
+3253,permute_323,call_function,permute.default,backward,27,1,1,1,4,3060,3
+3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5
+3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10
+3256,permute_324,call_function,permute.default,backward,27,1,1,1,3143,2,4
+3257,dtype_cast_259,call_function,dtype_cast.default,backward,27,1,1,1,3144,1,4
+3258,alias_default_1494,call_function,alias.default,backward,27,1,1,0,3145,0,3
+3259,convert_element_type_700,call_function,convert_element_type.default,backward,27,1,1,1,3149,3057,8
+3260,convert_element_type_701,call_function,convert_element_type.default,backward,27,1,1,1,3046,3057,4
+3261,convert_element_type_702,call_function,convert_element_type.default,backward,27,1,1,1,3,3051,2
+3262,alias_default_803,call_function,alias.default,backward,27,1,1,2,3150,3056,4
+3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8
+3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8
+3265,alias_default_804,call_function,alias.default,backward,27,1,1,2,3153,3049,4
+3266,alias_default_805,call_function,alias.default,backward,27,1,1,3,3055,3055,4
+3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8
+3268,sum_3,call_function,sum.dim_IntList,backward,27,1,1,1,3158,3047,5
+3269,div_29,call_function,div.Tensor,backward,27,1,1,1,3056,3047,6
+3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8
+3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10
+3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8
+3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8
+3274,sum_4,call_function,sum.dim_IntList,backward,27,1,1,1,3155,3,5
+3275,convert_element_type_703,call_function,convert_element_type.default,backward,27,1,1,1,3163,3043,6
+3276,convert_element_type_704,call_function,convert_element_type.default,backward,27,1,1,1,3156,2,3
+3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10
+3278,dtype_cast_260,call_function,dtype_cast.default,backward,27,1,1,1,3157,1,3
+3279,alias_default_1498,call_function,alias.default,backward,27,1,1,0,3158,0,2
+3280,alias_default_806,call_function,alias.default,unknown,,1,1,3,3165,3041,4
+3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5
+3282,permute_327,call_function,permute.default,backward,27,1,1,1,4,3037,3
+3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5
+3284,permute_328,call_function,permute.default,backward,27,1,1,1,3167,2,4
+3285,dtype_cast_261,call_function,dtype_cast.default,backward,27,1,1,1,3168,1,4
+3286,alias_default_1493,call_function,alias.default,backward,27,1,1,0,3169,0,3
+3287,view_656,call_function,view.default,backward,27,1,1,1,3168,3035,4
+3288,permute_329,call_function,permute.default,backward,27,1,1,1,3169,3034,4
+3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2
+3290,getitem_252,call_function,getitem,backward,27,1,1,1,3174,3006,2
+3291,getitem_253,call_function,getitem,backward,27,1,1,1,3174,3007,2
+3292,getitem_254,call_function,getitem,backward,27,1,1,1,3174,3000,2
+3293,permute_330,call_function,permute.default,backward,27,1,1,1,3175,2999,2
+3294,permute_331,call_function,permute.default,backward,27,1,1,1,3175,3006,2
+3295,permute_332,call_function,permute.default,backward,27,1,1,1,3175,3005,2
+3296,convert_element_type_709,call_function,convert_element_type.default,backward,27,1,1,1,3176,3005,2
+3297,convert_element_type_710,call_function,convert_element_type.default,backward,27,1,1,1,3176,3004,2
+3298,view_657,call_function,view.default,backward,27,1,1,1,3177,3004,2
+3299,view_as_complex_56,call_function,view_as_complex.default,backward,27,1,1,1,3178,3003,6
+3300,_conj,call_function,_conj.default,backward,27,1,1,1,4,3004,3
+3301,clone_6,call_function,clone.default,backward,27,1,1,1,5,3003,3
+3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8
+3303,view_658,call_function,view.default,backward,27,1,1,1,3177,3003,2
+3304,view_as_complex_57,call_function,view_as_complex.default,backward,27,1,1,1,3178,3002,6
+3305,_conj_1,call_function,_conj.default,backward,27,1,1,1,4,3003,3
+3306,clone_7,call_function,clone.default,backward,27,1,1,1,5,3002,3
+3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8
+3308,view_as_real_56,call_function,view_as_real.default,backward,27,1,1,1,3182,3001,6
+3309,view_659,call_function,view.default,backward,27,1,1,1,3183,3000,6
+3310,convert_element_type_711,call_function,convert_element_type.default,backward,27,1,1,1,3184,2999,6
+3311,view_as_real_57,call_function,view_as_real.default,backward,27,1,1,1,3182,3000,6
+3312,view_660,call_function,view.default,backward,27,1,1,1,3183,2999,6
+3313,convert_element_type_712,call_function,convert_element_type.default,backward,27,1,1,1,3184,2998,6
+3314,view_661,call_function,view.default,backward,27,1,1,1,3176,2998,2
+3315,view_662,call_function,view.default,backward,27,1,1,1,3185,2998,5
+3316,view_663,call_function,view.default,backward,27,1,1,1,3185,2997,5
+3317,alias_default_807,call_function,alias.default,backward,27,1,1,2,3177,2997,4
+3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5
+3319,permute_335,call_function,permute.default,backward,27,1,1,1,4,2993,3
+3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5
+3321,permute_336,call_function,permute.default,backward,27,1,1,1,3179,2,4
+3322,dtype_cast_262,call_function,dtype_cast.default,backward,27,1,1,1,3180,1,4
+3323,alias_default_1492,call_function,alias.default,backward,27,1,1,0,3181,0,3
+3324,alias_default_808,call_function,alias.default,backward,27,1,1,2,3186,2997,4
+3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5
+3326,permute_339,call_function,permute.default,backward,27,1,1,1,4,2993,3
+3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5
+3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10
+3329,permute_340,call_function,permute.default,backward,27,1,1,1,3188,2,4
+3330,dtype_cast_263,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4
+3331,alias_default_1491,call_function,alias.default,backward,27,1,1,0,3190,0,3
+3332,alias_default_809,call_function,alias.default,backward,27,1,1,2,3186,2996,4
+3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5
+3334,permute_343,call_function,permute.default,backward,27,1,1,1,4,2992,3
+3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5
+3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10
+3337,permute_344,call_function,permute.default,backward,27,1,1,1,3188,2,4
+3338,dtype_cast_264,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4
+3339,alias_default_1490,call_function,alias.default,backward,27,1,1,0,3190,0,3
+3340,convert_element_type_725,call_function,convert_element_type.default,backward,27,1,1,1,3212,2989,8
+3341,convert_element_type_726,call_function,convert_element_type.default,backward,27,1,1,1,2979,2989,4
+3342,convert_element_type_727,call_function,convert_element_type.default,backward,27,1,1,1,3,2983,2
+3343,alias_default_810,call_function,alias.default,backward,27,1,1,2,3213,2988,4
+3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8
+3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8
+3346,alias_default_811,call_function,alias.default,backward,27,1,1,2,3216,2981,4
+3347,alias_default_812,call_function,alias.default,backward,27,1,1,3,2988,2987,4
+3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8
+3349,sum_5,call_function,sum.dim_IntList,backward,27,1,1,1,3221,2979,5
+3350,div_30,call_function,div.Tensor,backward,27,1,1,1,2989,2979,6
+3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8
+3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10
+3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8
+3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8
+3355,sum_6,call_function,sum.dim_IntList,backward,27,1,1,1,3218,3,5
+3356,convert_element_type_728,call_function,convert_element_type.default,backward,27,1,1,1,3226,2975,6
+3357,convert_element_type_729,call_function,convert_element_type.default,backward,27,1,1,1,3219,2,3
+3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10
+3359,dtype_cast_265,call_function,dtype_cast.default,backward,27,1,1,1,3220,1,3
+3360,alias_default_1497,call_function,alias.default,backward,27,1,1,0,3221,0,2
+3361,alias_default_813,call_function,alias.default,unknown,,1,1,3,3228,2973,4
+3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5
+3363,permute_347,call_function,permute.default,backward,26,1,1,1,4,2969,3
+3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5
+3365,permute_348,call_function,permute.default,backward,26,1,1,1,3230,2,4
+3366,dtype_cast_266,call_function,dtype_cast.default,backward,26,1,1,1,3231,1,4
+3367,alias_default_1486,call_function,alias.default,backward,26,1,1,0,3232,0,3
+3368,alias_default_814,call_function,alias.default,backward,26,1,1,2,3231,2967,4
+3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8
+3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8
+3371,alias_default_815,call_function,alias.default,backward,26,1,1,2,3233,2954,4
+3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5
+3373,permute_351,call_function,permute.default,backward,26,1,1,1,4,2950,3
+3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5
+3375,permute_352,call_function,permute.default,backward,26,1,1,1,3235,2,4
+3376,dtype_cast_267,call_function,dtype_cast.default,backward,26,1,1,1,3236,1,4
+3377,alias_default_1487,call_function,alias.default,backward,26,1,1,0,3237,0,3
+3378,convert_element_type_738,call_function,convert_element_type.default,backward,26,1,1,1,3233,2958,6
+3379,convert_element_type_739,call_function,convert_element_type.default,backward,26,1,1,1,2956,2968,4
+3380,alias_default_816,call_function,alias.default,backward,26,1,1,2,2957,2967,4
+3381,neg_29,call_function,neg.default,backward,26,1,1,1,2958,2966,8
+3382,exp_29,call_function,exp.default,backward,26,1,1,1,2959,2965,6
+3383,add_148,call_function,add.Tensor,backward,26,1,1,1,2960,2964,4
+3384,reciprocal_1,call_function,reciprocal.default,backward,26,1,1,1,2961,2963,4
+3385,mul_226,call_function,mul.Tensor,backward,26,1,1,1,2962,2962,6
+3386,alias_default_817,call_function,alias.default,backward,26,1,1,2,2963,2961,4
+3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8
+3388,sub_4,call_function,sub.Tensor,backward,26,1,1,1,2964,2959,4
+3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8
+3390,add_149,call_function,add.Tensor,backward,26,1,1,1,2966,2957,4
+3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8
+3392,convert_element_type_740,call_function,convert_element_type.default,backward,26,1,1,1,3247,2955,6
+3393,alias_default_818,call_function,alias.default,backward,26,1,1,2,3248,2954,4
+3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5
+3395,permute_355,call_function,permute.default,backward,26,1,1,1,4,2950,3
+3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5
+3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10
+3398,permute_356,call_function,permute.default,backward,26,1,1,1,3250,2,4
+3399,dtype_cast_268,call_function,dtype_cast.default,backward,26,1,1,1,3251,1,4
+3400,alias_default_1485,call_function,alias.default,backward,26,1,1,0,3252,0,3
+3401,convert_element_type_745,call_function,convert_element_type.default,backward,26,1,1,1,3256,2947,8
+3402,convert_element_type_746,call_function,convert_element_type.default,backward,26,1,1,1,2936,2947,4
+3403,convert_element_type_747,call_function,convert_element_type.default,backward,26,1,1,1,3,2941,2
+3404,alias_default_819,call_function,alias.default,backward,26,1,1,2,3257,2946,4
+3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8
+3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8
+3407,alias_default_820,call_function,alias.default,backward,26,1,1,2,3260,2939,4
+3408,alias_default_821,call_function,alias.default,backward,26,1,1,3,2945,2945,4
+3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8
+3410,sum_7,call_function,sum.dim_IntList,backward,26,1,1,1,3265,2937,5
+3411,div_31,call_function,div.Tensor,backward,26,1,1,1,2946,2937,6
+3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8
+3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10
+3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8
+3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8
+3416,sum_8,call_function,sum.dim_IntList,backward,26,1,1,1,3262,3,5
+3417,convert_element_type_748,call_function,convert_element_type.default,backward,26,1,1,1,3270,2933,6
+3418,convert_element_type_749,call_function,convert_element_type.default,backward,26,1,1,1,3263,2,3
+3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10
+3420,dtype_cast_269,call_function,dtype_cast.default,backward,26,1,1,1,3264,1,3
+3421,alias_default_1489,call_function,alias.default,backward,26,1,1,0,3265,0,2
+3422,alias_default_822,call_function,alias.default,unknown,,1,1,3,3272,2931,4
+3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5
+3424,permute_359,call_function,permute.default,backward,26,1,1,1,4,2927,3
+3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5
+3426,permute_360,call_function,permute.default,backward,26,1,1,1,3274,2,4
+3427,dtype_cast_270,call_function,dtype_cast.default,backward,26,1,1,1,3275,1,4
+3428,alias_default_1484,call_function,alias.default,backward,26,1,1,0,3276,0,3
+3429,view_678,call_function,view.default,backward,26,1,1,1,3275,2925,4
+3430,permute_361,call_function,permute.default,backward,26,1,1,1,3276,2924,4
+3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2
+3432,getitem_255,call_function,getitem,backward,26,1,1,1,3281,2896,2
+3433,getitem_256,call_function,getitem,backward,26,1,1,1,3281,2897,2
+3434,getitem_257,call_function,getitem,backward,26,1,1,1,3281,2890,2
+3435,permute_362,call_function,permute.default,backward,26,1,1,1,3282,2889,2
+3436,permute_363,call_function,permute.default,backward,26,1,1,1,3282,2896,2
+3437,permute_364,call_function,permute.default,backward,26,1,1,1,3282,2895,2
+3438,convert_element_type_754,call_function,convert_element_type.default,backward,26,1,1,1,3283,2895,2
+3439,convert_element_type_755,call_function,convert_element_type.default,backward,26,1,1,1,3283,2894,2
+3440,view_679,call_function,view.default,backward,26,1,1,1,3284,2894,2
+3441,view_as_complex_58,call_function,view_as_complex.default,backward,26,1,1,1,3285,2893,6
+3442,_conj_2,call_function,_conj.default,backward,26,1,1,1,4,2894,3
+3443,clone_14,call_function,clone.default,backward,26,1,1,1,5,2893,3
+3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8
+3445,view_680,call_function,view.default,backward,26,1,1,1,3284,2893,2
+3446,view_as_complex_59,call_function,view_as_complex.default,backward,26,1,1,1,3285,2892,6
+3447,_conj_3,call_function,_conj.default,backward,26,1,1,1,4,2893,3
+3448,clone_15,call_function,clone.default,backward,26,1,1,1,5,2892,3
+3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8
+3450,view_as_real_58,call_function,view_as_real.default,backward,26,1,1,1,3289,2891,6
+3451,view_681,call_function,view.default,backward,26,1,1,1,3290,2890,6
+3452,convert_element_type_756,call_function,convert_element_type.default,backward,26,1,1,1,3291,2889,6
+3453,view_as_real_59,call_function,view_as_real.default,backward,26,1,1,1,3289,2890,6
+3454,view_682,call_function,view.default,backward,26,1,1,1,3290,2889,6
+3455,convert_element_type_757,call_function,convert_element_type.default,backward,26,1,1,1,3291,2888,6
+3456,view_683,call_function,view.default,backward,26,1,1,1,3283,2888,2
+3457,view_684,call_function,view.default,backward,26,1,1,1,3292,2888,5
+3458,view_685,call_function,view.default,backward,26,1,1,1,3292,2887,5
+3459,alias_default_823,call_function,alias.default,backward,26,1,1,2,3284,2887,4
+3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5
+3461,permute_367,call_function,permute.default,backward,26,1,1,1,4,2883,3
+3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5
+3463,permute_368,call_function,permute.default,backward,26,1,1,1,3286,2,4
+3464,dtype_cast_271,call_function,dtype_cast.default,backward,26,1,1,1,3287,1,4
+3465,alias_default_1483,call_function,alias.default,backward,26,1,1,0,3288,0,3
+3466,alias_default_824,call_function,alias.default,backward,26,1,1,2,3293,2887,4
+3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5
+3468,permute_371,call_function,permute.default,backward,26,1,1,1,4,2883,3
+3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5
+3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10
+3471,permute_372,call_function,permute.default,backward,26,1,1,1,3295,2,4
+3472,dtype_cast_272,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4
+3473,alias_default_1482,call_function,alias.default,backward,26,1,1,0,3297,0,3
+3474,alias_default_825,call_function,alias.default,backward,26,1,1,2,3293,2886,4
+3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5
+3476,permute_375,call_function,permute.default,backward,26,1,1,1,4,2882,3
+3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5
+3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10
+3479,permute_376,call_function,permute.default,backward,26,1,1,1,3295,2,4
+3480,dtype_cast_273,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4
+3481,alias_default_1481,call_function,alias.default,backward,26,1,1,0,3297,0,3
+3482,convert_element_type_770,call_function,convert_element_type.default,backward,26,1,1,1,3319,2879,8
+3483,convert_element_type_771,call_function,convert_element_type.default,backward,26,1,1,1,2869,2879,4
+3484,convert_element_type_772,call_function,convert_element_type.default,backward,26,1,1,1,3,2873,2
+3485,alias_default_826,call_function,alias.default,backward,26,1,1,2,3320,2878,4
+3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8
+3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8
+3488,alias_default_827,call_function,alias.default,backward,26,1,1,2,3323,2871,4
+3489,alias_default_828,call_function,alias.default,backward,26,1,1,3,2878,2877,4
+3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8
+3491,sum_9,call_function,sum.dim_IntList,backward,26,1,1,1,3328,2869,5
+3492,div_32,call_function,div.Tensor,backward,26,1,1,1,2879,2869,6
+3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8
+3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10
+3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8
+3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8
+3497,sum_10,call_function,sum.dim_IntList,backward,26,1,1,1,3325,3,5
+3498,convert_element_type_773,call_function,convert_element_type.default,backward,26,1,1,1,3333,2865,6
+3499,convert_element_type_774,call_function,convert_element_type.default,backward,26,1,1,1,3326,2,3
+3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10
+3501,dtype_cast_274,call_function,dtype_cast.default,backward,26,1,1,1,3327,1,3
+3502,alias_default_1488,call_function,alias.default,backward,26,1,1,0,3328,0,2
+3503,alias_default_829,call_function,alias.default,unknown,,1,1,3,3335,2863,4
+3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5
+3505,permute_379,call_function,permute.default,backward,25,1,1,1,4,2859,3
+3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5
+3507,permute_380,call_function,permute.default,backward,25,1,1,1,3337,2,4
+3508,dtype_cast_275,call_function,dtype_cast.default,backward,25,1,1,1,3338,1,4
+3509,alias_default_1477,call_function,alias.default,backward,25,1,1,0,3339,0,3
+3510,alias_default_830,call_function,alias.default,backward,25,1,1,2,3338,2857,4
+3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8
+3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8
+3513,alias_default_831,call_function,alias.default,backward,25,1,1,2,3340,2844,4
+3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5
+3515,permute_383,call_function,permute.default,backward,25,1,1,1,4,2840,3
+3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5
+3517,permute_384,call_function,permute.default,backward,25,1,1,1,3342,2,4
+3518,dtype_cast_276,call_function,dtype_cast.default,backward,25,1,1,1,3343,1,4
+3519,alias_default_1478,call_function,alias.default,backward,25,1,1,0,3344,0,3
+3520,convert_element_type_783,call_function,convert_element_type.default,backward,25,1,1,1,3340,2848,6
+3521,convert_element_type_784,call_function,convert_element_type.default,backward,25,1,1,1,2846,2858,4
+3522,alias_default_832,call_function,alias.default,backward,25,1,1,2,2847,2857,4
+3523,neg_30,call_function,neg.default,backward,25,1,1,1,2848,2856,8
+3524,exp_30,call_function,exp.default,backward,25,1,1,1,2849,2855,6
+3525,add_155,call_function,add.Tensor,backward,25,1,1,1,2850,2854,4
+3526,reciprocal_2,call_function,reciprocal.default,backward,25,1,1,1,2851,2853,4
+3527,mul_246,call_function,mul.Tensor,backward,25,1,1,1,2852,2852,6
+3528,alias_default_833,call_function,alias.default,backward,25,1,1,2,2853,2851,4
+3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8
+3530,sub_7,call_function,sub.Tensor,backward,25,1,1,1,2854,2849,4
+3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8
+3532,add_156,call_function,add.Tensor,backward,25,1,1,1,2856,2847,4
+3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8
+3534,convert_element_type_785,call_function,convert_element_type.default,backward,25,1,1,1,3354,2845,6
+3535,alias_default_834,call_function,alias.default,backward,25,1,1,2,3355,2844,4
+3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5
+3537,permute_387,call_function,permute.default,backward,25,1,1,1,4,2840,3
+3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5
+3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10
+3540,permute_388,call_function,permute.default,backward,25,1,1,1,3357,2,4
+3541,dtype_cast_277,call_function,dtype_cast.default,backward,25,1,1,1,3358,1,4
+3542,alias_default_1476,call_function,alias.default,backward,25,1,1,0,3359,0,3
+3543,convert_element_type_790,call_function,convert_element_type.default,backward,25,1,1,1,3363,2837,8
+3544,convert_element_type_791,call_function,convert_element_type.default,backward,25,1,1,1,2826,2837,4
+3545,convert_element_type_792,call_function,convert_element_type.default,backward,25,1,1,1,3,2831,2
+3546,alias_default_835,call_function,alias.default,backward,25,1,1,2,3364,2836,4
+3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8
+3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8
+3549,alias_default_836,call_function,alias.default,backward,25,1,1,2,3367,2829,4
+3550,alias_default_837,call_function,alias.default,backward,25,1,1,3,2835,2835,4
+3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8
+3552,sum_11,call_function,sum.dim_IntList,backward,25,1,1,1,3372,2827,5
+3553,div_33,call_function,div.Tensor,backward,25,1,1,1,2836,2827,6
+3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8
+3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10
+3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8
+3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8
+3558,sum_12,call_function,sum.dim_IntList,backward,25,1,1,1,3369,3,5
+3559,convert_element_type_793,call_function,convert_element_type.default,backward,25,1,1,1,3377,2823,6
+3560,convert_element_type_794,call_function,convert_element_type.default,backward,25,1,1,1,3370,2,3
+3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10
+3562,dtype_cast_278,call_function,dtype_cast.default,backward,25,1,1,1,3371,1,3
+3563,alias_default_1480,call_function,alias.default,backward,25,1,1,0,3372,0,2
+3564,alias_default_838,call_function,alias.default,unknown,,1,1,3,3379,2821,4
+3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5
+3566,permute_391,call_function,permute.default,backward,25,1,1,1,4,2817,3
+3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5
+3568,permute_392,call_function,permute.default,backward,25,1,1,1,3381,2,4
+3569,dtype_cast_279,call_function,dtype_cast.default,backward,25,1,1,1,3382,1,4
+3570,alias_default_1475,call_function,alias.default,backward,25,1,1,0,3383,0,3
+3571,view_700,call_function,view.default,backward,25,1,1,1,3382,2815,4
+3572,permute_393,call_function,permute.default,backward,25,1,1,1,3383,2814,4
+3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2
+3574,getitem_258,call_function,getitem,backward,25,1,1,1,3388,2786,2
+3575,getitem_259,call_function,getitem,backward,25,1,1,1,3388,2787,2
+3576,getitem_260,call_function,getitem,backward,25,1,1,1,3388,2780,2
+3577,permute_394,call_function,permute.default,backward,25,1,1,1,3389,2779,2
+3578,permute_395,call_function,permute.default,backward,25,1,1,1,3389,2786,2
+3579,permute_396,call_function,permute.default,backward,25,1,1,1,3389,2785,2
+3580,convert_element_type_799,call_function,convert_element_type.default,backward,25,1,1,1,3390,2785,2
+3581,convert_element_type_800,call_function,convert_element_type.default,backward,25,1,1,1,3390,2784,2
+3582,view_701,call_function,view.default,backward,25,1,1,1,3391,2784,2
+3583,view_as_complex_60,call_function,view_as_complex.default,backward,25,1,1,1,3392,2783,6
+3584,_conj_4,call_function,_conj.default,backward,25,1,1,1,4,2784,3
+3585,clone_22,call_function,clone.default,backward,25,1,1,1,5,2783,3
+3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8
+3587,view_702,call_function,view.default,backward,25,1,1,1,3391,2783,2
+3588,view_as_complex_61,call_function,view_as_complex.default,backward,25,1,1,1,3392,2782,6
+3589,_conj_5,call_function,_conj.default,backward,25,1,1,1,4,2783,3
+3590,clone_23,call_function,clone.default,backward,25,1,1,1,5,2782,3
+3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8
+3592,view_as_real_60,call_function,view_as_real.default,backward,25,1,1,1,3396,2781,6
+3593,view_703,call_function,view.default,backward,25,1,1,1,3397,2780,6
+3594,convert_element_type_801,call_function,convert_element_type.default,backward,25,1,1,1,3398,2779,6
+3595,view_as_real_61,call_function,view_as_real.default,backward,25,1,1,1,3396,2780,6
+3596,view_704,call_function,view.default,backward,25,1,1,1,3397,2779,6
+3597,convert_element_type_802,call_function,convert_element_type.default,backward,25,1,1,1,3398,2778,6
+3598,view_705,call_function,view.default,backward,25,1,1,1,3390,2778,2
+3599,view_706,call_function,view.default,backward,25,1,1,1,3399,2778,5
+3600,view_707,call_function,view.default,backward,25,1,1,1,3399,2777,5
+3601,alias_default_839,call_function,alias.default,backward,25,1,1,2,3391,2777,4
+3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5
+3603,permute_399,call_function,permute.default,backward,25,1,1,1,4,2773,3
+3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5
+3605,permute_400,call_function,permute.default,backward,25,1,1,1,3393,2,4
+3606,dtype_cast_280,call_function,dtype_cast.default,backward,25,1,1,1,3394,1,4
+3607,alias_default_1474,call_function,alias.default,backward,25,1,1,0,3395,0,3
+3608,alias_default_840,call_function,alias.default,backward,25,1,1,2,3400,2777,4
+3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5
+3610,permute_403,call_function,permute.default,backward,25,1,1,1,4,2773,3
+3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5
+3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10
+3613,permute_404,call_function,permute.default,backward,25,1,1,1,3402,2,4
+3614,dtype_cast_281,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4
+3615,alias_default_1473,call_function,alias.default,backward,25,1,1,0,3404,0,3
+3616,alias_default_841,call_function,alias.default,backward,25,1,1,2,3400,2776,4
+3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5
+3618,permute_407,call_function,permute.default,backward,25,1,1,1,4,2772,3
+3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5
+3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10
+3621,permute_408,call_function,permute.default,backward,25,1,1,1,3402,2,4
+3622,dtype_cast_282,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4
+3623,alias_default_1472,call_function,alias.default,backward,25,1,1,0,3404,0,3
+3624,convert_element_type_815,call_function,convert_element_type.default,backward,25,1,1,1,3426,2769,8
+3625,convert_element_type_816,call_function,convert_element_type.default,backward,25,1,1,1,2759,2769,4
+3626,convert_element_type_817,call_function,convert_element_type.default,backward,25,1,1,1,3,2763,2
+3627,alias_default_842,call_function,alias.default,backward,25,1,1,2,3427,2768,4
+3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8
+3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8
+3630,alias_default_843,call_function,alias.default,backward,25,1,1,2,3430,2761,4
+3631,alias_default_844,call_function,alias.default,backward,25,1,1,3,2768,2767,4
+3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8
+3633,sum_13,call_function,sum.dim_IntList,backward,25,1,1,1,3435,2759,5
+3634,div_34,call_function,div.Tensor,backward,25,1,1,1,2769,2759,6
+3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8
+3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10
+3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8
+3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8
+3639,sum_14,call_function,sum.dim_IntList,backward,25,1,1,1,3432,3,5
+3640,convert_element_type_818,call_function,convert_element_type.default,backward,25,1,1,1,3440,2755,6
+3641,convert_element_type_819,call_function,convert_element_type.default,backward,25,1,1,1,3433,2,3
+3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10
+3643,dtype_cast_283,call_function,dtype_cast.default,backward,25,1,1,1,3434,1,3
+3644,alias_default_1479,call_function,alias.default,backward,25,1,1,0,3435,0,2
+3645,alias_default_845,call_function,alias.default,unknown,,1,1,3,3442,2753,4
+3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5
+3647,permute_411,call_function,permute.default,backward,24,1,1,1,4,2749,3
+3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5
+3649,permute_412,call_function,permute.default,backward,24,1,1,1,3444,2,4
+3650,dtype_cast_284,call_function,dtype_cast.default,backward,24,1,1,1,3445,1,4
+3651,alias_default_1468,call_function,alias.default,backward,24,1,1,0,3446,0,3
+3652,alias_default_846,call_function,alias.default,backward,24,1,1,2,3445,2747,4
+3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8
+3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8
+3655,alias_default_847,call_function,alias.default,backward,24,1,1,2,3447,2734,4
+3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5
+3657,permute_415,call_function,permute.default,backward,24,1,1,1,4,2730,3
+3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5
+3659,permute_416,call_function,permute.default,backward,24,1,1,1,3449,2,4
+3660,dtype_cast_285,call_function,dtype_cast.default,backward,24,1,1,1,3450,1,4
+3661,alias_default_1469,call_function,alias.default,backward,24,1,1,0,3451,0,3
+3662,convert_element_type_828,call_function,convert_element_type.default,backward,24,1,1,1,3447,2738,6
+3663,convert_element_type_829,call_function,convert_element_type.default,backward,24,1,1,1,2736,2748,4
+3664,alias_default_848,call_function,alias.default,backward,24,1,1,2,2737,2747,4
+3665,neg_31,call_function,neg.default,backward,24,1,1,1,2738,2746,8
+3666,exp_31,call_function,exp.default,backward,24,1,1,1,2739,2745,6
+3667,add_162,call_function,add.Tensor,backward,24,1,1,1,2740,2744,4
+3668,reciprocal_3,call_function,reciprocal.default,backward,24,1,1,1,2741,2743,4
+3669,mul_266,call_function,mul.Tensor,backward,24,1,1,1,2742,2742,6
+3670,alias_default_849,call_function,alias.default,backward,24,1,1,2,2743,2741,4
+3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8
+3672,sub_10,call_function,sub.Tensor,backward,24,1,1,1,2744,2739,4
+3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8
+3674,add_163,call_function,add.Tensor,backward,24,1,1,1,2746,2737,4
+3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8
+3676,convert_element_type_830,call_function,convert_element_type.default,backward,24,1,1,1,3461,2735,6
+3677,alias_default_850,call_function,alias.default,backward,24,1,1,2,3462,2734,4
+3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5
+3679,permute_419,call_function,permute.default,backward,24,1,1,1,4,2730,3
+3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5
+3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10
+3682,permute_420,call_function,permute.default,backward,24,1,1,1,3464,2,4
+3683,dtype_cast_286,call_function,dtype_cast.default,backward,24,1,1,1,3465,1,4
+3684,alias_default_1467,call_function,alias.default,backward,24,1,1,0,3466,0,3
+3685,convert_element_type_835,call_function,convert_element_type.default,backward,24,1,1,1,3470,2727,8
+3686,convert_element_type_836,call_function,convert_element_type.default,backward,24,1,1,1,2716,2727,4
+3687,convert_element_type_837,call_function,convert_element_type.default,backward,24,1,1,1,3,2721,2
+3688,alias_default_851,call_function,alias.default,backward,24,1,1,2,3471,2726,4
+3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8
+3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8
+3691,alias_default_852,call_function,alias.default,backward,24,1,1,2,3474,2719,4
+3692,alias_default_853,call_function,alias.default,backward,24,1,1,3,2725,2725,4
+3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8
+3694,sum_15,call_function,sum.dim_IntList,backward,24,1,1,1,3479,2717,5
+3695,div_35,call_function,div.Tensor,backward,24,1,1,1,2726,2717,6
+3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8
+3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10
+3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8
+3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8
+3700,sum_16,call_function,sum.dim_IntList,backward,24,1,1,1,3476,3,5
+3701,convert_element_type_838,call_function,convert_element_type.default,backward,24,1,1,1,3484,2713,6
+3702,convert_element_type_839,call_function,convert_element_type.default,backward,24,1,1,1,3477,2,3
+3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10
+3704,dtype_cast_287,call_function,dtype_cast.default,backward,24,1,1,1,3478,1,3
+3705,alias_default_1471,call_function,alias.default,backward,24,1,1,0,3479,0,2
+3706,alias_default_854,call_function,alias.default,unknown,,1,1,3,3486,2711,4
+3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5
+3708,permute_423,call_function,permute.default,backward,24,1,1,1,4,2707,3
+3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5
+3710,permute_424,call_function,permute.default,backward,24,1,1,1,3488,2,4
+3711,dtype_cast_288,call_function,dtype_cast.default,backward,24,1,1,1,3489,1,4
+3712,alias_default_1466,call_function,alias.default,backward,24,1,1,0,3490,0,3
+3713,view_722,call_function,view.default,backward,24,1,1,1,3489,2705,4
+3714,permute_425,call_function,permute.default,backward,24,1,1,1,3490,2704,4
+3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2
+3716,getitem_261,call_function,getitem,backward,24,1,1,1,3495,2676,2
+3717,getitem_262,call_function,getitem,backward,24,1,1,1,3495,2677,2
+3718,getitem_263,call_function,getitem,backward,24,1,1,1,3495,2670,2
+3719,permute_426,call_function,permute.default,backward,24,1,1,1,3496,2669,2
+3720,permute_427,call_function,permute.default,backward,24,1,1,1,3496,2676,2
+3721,permute_428,call_function,permute.default,backward,24,1,1,1,3496,2675,2
+3722,convert_element_type_844,call_function,convert_element_type.default,backward,24,1,1,1,3497,2675,2
+3723,convert_element_type_845,call_function,convert_element_type.default,backward,24,1,1,1,3497,2674,2
+3724,view_723,call_function,view.default,backward,24,1,1,1,3498,2674,2
+3725,view_as_complex_62,call_function,view_as_complex.default,backward,24,1,1,1,3499,2673,6
+3726,_conj_6,call_function,_conj.default,backward,24,1,1,1,4,2674,3
+3727,clone_30,call_function,clone.default,backward,24,1,1,1,5,2673,3
+3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8
+3729,view_724,call_function,view.default,backward,24,1,1,1,3498,2673,2
+3730,view_as_complex_63,call_function,view_as_complex.default,backward,24,1,1,1,3499,2672,6
+3731,_conj_7,call_function,_conj.default,backward,24,1,1,1,4,2673,3
+3732,clone_31,call_function,clone.default,backward,24,1,1,1,5,2672,3
+3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8
+3734,view_as_real_62,call_function,view_as_real.default,backward,24,1,1,1,3503,2671,6
+3735,view_725,call_function,view.default,backward,24,1,1,1,3504,2670,6
+3736,convert_element_type_846,call_function,convert_element_type.default,backward,24,1,1,1,3505,2669,6
+3737,view_as_real_63,call_function,view_as_real.default,backward,24,1,1,1,3503,2670,6
+3738,view_726,call_function,view.default,backward,24,1,1,1,3504,2669,6
+3739,convert_element_type_847,call_function,convert_element_type.default,backward,24,1,1,1,3505,2668,6
+3740,view_727,call_function,view.default,backward,24,1,1,1,3497,2668,2
+3741,view_728,call_function,view.default,backward,24,1,1,1,3506,2668,5
+3742,view_729,call_function,view.default,backward,24,1,1,1,3506,2667,5
+3743,alias_default_855,call_function,alias.default,backward,24,1,1,2,3498,2667,4
+3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5
+3745,permute_431,call_function,permute.default,backward,24,1,1,1,4,2663,3
+3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5
+3747,permute_432,call_function,permute.default,backward,24,1,1,1,3500,2,4
+3748,dtype_cast_289,call_function,dtype_cast.default,backward,24,1,1,1,3501,1,4
+3749,alias_default_1465,call_function,alias.default,backward,24,1,1,0,3502,0,3
+3750,alias_default_856,call_function,alias.default,backward,24,1,1,2,3507,2667,4
+3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5
+3752,permute_435,call_function,permute.default,backward,24,1,1,1,4,2663,3
+3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5
+3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10
+3755,permute_436,call_function,permute.default,backward,24,1,1,1,3509,2,4
+3756,dtype_cast_290,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4
+3757,alias_default_1464,call_function,alias.default,backward,24,1,1,0,3511,0,3
+3758,alias_default_857,call_function,alias.default,backward,24,1,1,2,3507,2666,4
+3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5
+3760,permute_439,call_function,permute.default,backward,24,1,1,1,4,2662,3
+3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5
+3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10
+3763,permute_440,call_function,permute.default,backward,24,1,1,1,3509,2,4
+3764,dtype_cast_291,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4
+3765,alias_default_1463,call_function,alias.default,backward,24,1,1,0,3511,0,3
+3766,convert_element_type_860,call_function,convert_element_type.default,backward,24,1,1,1,3533,2659,8
+3767,convert_element_type_861,call_function,convert_element_type.default,backward,24,1,1,1,2649,2659,4
+3768,convert_element_type_862,call_function,convert_element_type.default,backward,24,1,1,1,3,2653,2
+3769,alias_default_858,call_function,alias.default,backward,24,1,1,2,3534,2658,4
+3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8
+3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8
+3772,alias_default_859,call_function,alias.default,backward,24,1,1,2,3537,2651,4
+3773,alias_default_860,call_function,alias.default,backward,24,1,1,3,2658,2657,4
+3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8
+3775,sum_17,call_function,sum.dim_IntList,backward,24,1,1,1,3542,2649,5
+3776,div_36,call_function,div.Tensor,backward,24,1,1,1,2659,2649,6
+3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8
+3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10
+3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8
+3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8
+3781,sum_18,call_function,sum.dim_IntList,backward,24,1,1,1,3539,3,5
+3782,convert_element_type_863,call_function,convert_element_type.default,backward,24,1,1,1,3547,2645,6
+3783,convert_element_type_864,call_function,convert_element_type.default,backward,24,1,1,1,3540,2,3
+3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10
+3785,dtype_cast_292,call_function,dtype_cast.default,backward,24,1,1,1,3541,1,3
+3786,alias_default_1470,call_function,alias.default,backward,24,1,1,0,3542,0,2
+3787,alias_default_861,call_function,alias.default,unknown,,1,1,3,3549,2643,4
+3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5
+3789,permute_443,call_function,permute.default,backward,23,1,1,1,4,2639,3
+3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5
+3791,permute_444,call_function,permute.default,backward,23,1,1,1,3551,2,4
+3792,dtype_cast_293,call_function,dtype_cast.default,backward,23,1,1,1,3552,1,4
+3793,alias_default_1459,call_function,alias.default,backward,23,1,1,0,3553,0,3
+3794,alias_default_862,call_function,alias.default,backward,23,1,1,2,3552,2637,4
+3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8
+3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8
+3797,alias_default_863,call_function,alias.default,backward,23,1,1,2,3554,2624,4
+3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5
+3799,permute_447,call_function,permute.default,backward,23,1,1,1,4,2620,3
+3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5
+3801,permute_448,call_function,permute.default,backward,23,1,1,1,3556,2,4
+3802,dtype_cast_294,call_function,dtype_cast.default,backward,23,1,1,1,3557,1,4
+3803,alias_default_1460,call_function,alias.default,backward,23,1,1,0,3558,0,3
+3804,convert_element_type_873,call_function,convert_element_type.default,backward,23,1,1,1,3554,2628,6
+3805,convert_element_type_874,call_function,convert_element_type.default,backward,23,1,1,1,2626,2638,4
+3806,alias_default_864,call_function,alias.default,backward,23,1,1,2,2627,2637,4
+3807,neg_32,call_function,neg.default,backward,23,1,1,1,2628,2636,8
+3808,exp_32,call_function,exp.default,backward,23,1,1,1,2629,2635,6
+3809,add_169,call_function,add.Tensor,backward,23,1,1,1,2630,2634,4
+3810,reciprocal_4,call_function,reciprocal.default,backward,23,1,1,1,2631,2633,4
+3811,mul_286,call_function,mul.Tensor,backward,23,1,1,1,2632,2632,6
+3812,alias_default_865,call_function,alias.default,backward,23,1,1,2,2633,2631,4
+3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8
+3814,sub_13,call_function,sub.Tensor,backward,23,1,1,1,2634,2629,4
+3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8
+3816,add_170,call_function,add.Tensor,backward,23,1,1,1,2636,2627,4
+3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8
+3818,convert_element_type_875,call_function,convert_element_type.default,backward,23,1,1,1,3568,2625,6
+3819,alias_default_866,call_function,alias.default,backward,23,1,1,2,3569,2624,4
+3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5
+3821,permute_451,call_function,permute.default,backward,23,1,1,1,4,2620,3
+3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5
+3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10
+3824,permute_452,call_function,permute.default,backward,23,1,1,1,3571,2,4
+3825,dtype_cast_295,call_function,dtype_cast.default,backward,23,1,1,1,3572,1,4
+3826,alias_default_1458,call_function,alias.default,backward,23,1,1,0,3573,0,3
+3827,convert_element_type_880,call_function,convert_element_type.default,backward,23,1,1,1,3577,2617,8
+3828,convert_element_type_881,call_function,convert_element_type.default,backward,23,1,1,1,2606,2617,4
+3829,convert_element_type_882,call_function,convert_element_type.default,backward,23,1,1,1,3,2611,2
+3830,alias_default_867,call_function,alias.default,backward,23,1,1,2,3578,2616,4
+3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8
+3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8
+3833,alias_default_868,call_function,alias.default,backward,23,1,1,2,3581,2609,4
+3834,alias_default_869,call_function,alias.default,backward,23,1,1,3,2615,2615,4
+3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8
+3836,sum_19,call_function,sum.dim_IntList,backward,23,1,1,1,3586,2607,5
+3837,div_37,call_function,div.Tensor,backward,23,1,1,1,2616,2607,6
+3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8
+3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10
+3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8
+3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8
+3842,sum_20,call_function,sum.dim_IntList,backward,23,1,1,1,3583,3,5
+3843,convert_element_type_883,call_function,convert_element_type.default,backward,23,1,1,1,3591,2603,6
+3844,convert_element_type_884,call_function,convert_element_type.default,backward,23,1,1,1,3584,2,3
+3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10
+3846,dtype_cast_296,call_function,dtype_cast.default,backward,23,1,1,1,3585,1,3
+3847,alias_default_1462,call_function,alias.default,backward,23,1,1,0,3586,0,2
+3848,alias_default_870,call_function,alias.default,unknown,,1,1,3,3593,2601,4
+3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5
+3850,permute_455,call_function,permute.default,backward,23,1,1,1,4,2597,3
+3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5
+3852,permute_456,call_function,permute.default,backward,23,1,1,1,3595,2,4
+3853,dtype_cast_297,call_function,dtype_cast.default,backward,23,1,1,1,3596,1,4
+3854,alias_default_1457,call_function,alias.default,backward,23,1,1,0,3597,0,3
+3855,view_744,call_function,view.default,backward,23,1,1,1,3596,2595,4
+3856,permute_457,call_function,permute.default,backward,23,1,1,1,3597,2594,4
+3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2
+3858,getitem_264,call_function,getitem,backward,23,1,1,1,3602,2566,2
+3859,getitem_265,call_function,getitem,backward,23,1,1,1,3602,2567,2
+3860,getitem_266,call_function,getitem,backward,23,1,1,1,3602,2560,2
+3861,permute_458,call_function,permute.default,backward,23,1,1,1,3603,2559,2
+3862,permute_459,call_function,permute.default,backward,23,1,1,1,3603,2566,2
+3863,permute_460,call_function,permute.default,backward,23,1,1,1,3603,2565,2
+3864,convert_element_type_889,call_function,convert_element_type.default,backward,23,1,1,1,3604,2565,2
+3865,convert_element_type_890,call_function,convert_element_type.default,backward,23,1,1,1,3604,2564,2
+3866,view_745,call_function,view.default,backward,23,1,1,1,3605,2564,2
+3867,view_as_complex_64,call_function,view_as_complex.default,backward,23,1,1,1,3606,2563,6
+3868,_conj_8,call_function,_conj.default,backward,23,1,1,1,4,2564,3
+3869,clone_38,call_function,clone.default,backward,23,1,1,1,5,2563,3
+3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8
+3871,view_746,call_function,view.default,backward,23,1,1,1,3605,2563,2
+3872,view_as_complex_65,call_function,view_as_complex.default,backward,23,1,1,1,3606,2562,6
+3873,_conj_9,call_function,_conj.default,backward,23,1,1,1,4,2563,3
+3874,clone_39,call_function,clone.default,backward,23,1,1,1,5,2562,3
+3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8
+3876,view_as_real_64,call_function,view_as_real.default,backward,23,1,1,1,3610,2561,6
+3877,view_747,call_function,view.default,backward,23,1,1,1,3611,2560,6
+3878,convert_element_type_891,call_function,convert_element_type.default,backward,23,1,1,1,3612,2559,6
+3879,view_as_real_65,call_function,view_as_real.default,backward,23,1,1,1,3610,2560,6
+3880,view_748,call_function,view.default,backward,23,1,1,1,3611,2559,6
+3881,convert_element_type_892,call_function,convert_element_type.default,backward,23,1,1,1,3612,2558,6
+3882,view_749,call_function,view.default,backward,23,1,1,1,3604,2558,2
+3883,view_750,call_function,view.default,backward,23,1,1,1,3613,2558,5
+3884,view_751,call_function,view.default,backward,23,1,1,1,3613,2557,5
+3885,alias_default_871,call_function,alias.default,backward,23,1,1,2,3605,2557,4
+3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5
+3887,permute_463,call_function,permute.default,backward,23,1,1,1,4,2553,3
+3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5
+3889,permute_464,call_function,permute.default,backward,23,1,1,1,3607,2,4
+3890,dtype_cast_298,call_function,dtype_cast.default,backward,23,1,1,1,3608,1,4
+3891,alias_default_1456,call_function,alias.default,backward,23,1,1,0,3609,0,3
+3892,alias_default_872,call_function,alias.default,backward,23,1,1,2,3614,2557,4
+3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5
+3894,permute_467,call_function,permute.default,backward,23,1,1,1,4,2553,3
+3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5
+3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10
+3897,permute_468,call_function,permute.default,backward,23,1,1,1,3616,2,4
+3898,dtype_cast_299,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4
+3899,alias_default_1455,call_function,alias.default,backward,23,1,1,0,3618,0,3
+3900,alias_default_873,call_function,alias.default,backward,23,1,1,2,3614,2556,4
+3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5
+3902,permute_471,call_function,permute.default,backward,23,1,1,1,4,2552,3
+3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5
+3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10
+3905,permute_472,call_function,permute.default,backward,23,1,1,1,3616,2,4
+3906,dtype_cast_300,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4
+3907,alias_default_1454,call_function,alias.default,backward,23,1,1,0,3618,0,3
+3908,convert_element_type_905,call_function,convert_element_type.default,backward,23,1,1,1,3640,2549,8
+3909,convert_element_type_906,call_function,convert_element_type.default,backward,23,1,1,1,2539,2549,4
+3910,convert_element_type_907,call_function,convert_element_type.default,backward,23,1,1,1,3,2543,2
+3911,alias_default_874,call_function,alias.default,backward,23,1,1,2,3641,2548,4
+3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8
+3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8
+3914,alias_default_875,call_function,alias.default,backward,23,1,1,2,3644,2541,4
+3915,alias_default_876,call_function,alias.default,backward,23,1,1,3,2548,2547,4
+3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8
+3917,sum_21,call_function,sum.dim_IntList,backward,23,1,1,1,3649,2539,5
+3918,div_38,call_function,div.Tensor,backward,23,1,1,1,2549,2539,6
+3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8
+3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10
+3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8
+3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8
+3923,sum_22,call_function,sum.dim_IntList,backward,23,1,1,1,3646,3,5
+3924,convert_element_type_908,call_function,convert_element_type.default,backward,23,1,1,1,3654,2535,6
+3925,convert_element_type_909,call_function,convert_element_type.default,backward,23,1,1,1,3647,2,3
+3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10
+3927,dtype_cast_301,call_function,dtype_cast.default,backward,23,1,1,1,3648,1,3
+3928,alias_default_1461,call_function,alias.default,backward,23,1,1,0,3649,0,2
+3929,alias_default_877,call_function,alias.default,unknown,,1,1,3,3656,2533,4
+3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5
+3931,permute_475,call_function,permute.default,backward,22,1,1,1,4,2529,3
+3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5
+3933,permute_476,call_function,permute.default,backward,22,1,1,1,3658,2,4
+3934,dtype_cast_302,call_function,dtype_cast.default,backward,22,1,1,1,3659,1,4
+3935,alias_default_1450,call_function,alias.default,backward,22,1,1,0,3660,0,3
+3936,alias_default_878,call_function,alias.default,backward,22,1,1,2,3659,2527,4
+3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8
+3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8
+3939,alias_default_879,call_function,alias.default,backward,22,1,1,2,3661,2514,4
+3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5
+3941,permute_479,call_function,permute.default,backward,22,1,1,1,4,2510,3
+3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5
+3943,permute_480,call_function,permute.default,backward,22,1,1,1,3663,2,4
+3944,dtype_cast_303,call_function,dtype_cast.default,backward,22,1,1,1,3664,1,4
+3945,alias_default_1451,call_function,alias.default,backward,22,1,1,0,3665,0,3
+3946,convert_element_type_918,call_function,convert_element_type.default,backward,22,1,1,1,3661,2518,6
+3947,convert_element_type_919,call_function,convert_element_type.default,backward,22,1,1,1,2516,2528,4
+3948,alias_default_880,call_function,alias.default,backward,22,1,1,2,2517,2527,4
+3949,neg_33,call_function,neg.default,backward,22,1,1,1,2518,2526,8
+3950,exp_33,call_function,exp.default,backward,22,1,1,1,2519,2525,6
+3951,add_176,call_function,add.Tensor,backward,22,1,1,1,2520,2524,4
+3952,reciprocal_5,call_function,reciprocal.default,backward,22,1,1,1,2521,2523,4
+3953,mul_306,call_function,mul.Tensor,backward,22,1,1,1,2522,2522,6
+3954,alias_default_881,call_function,alias.default,backward,22,1,1,2,2523,2521,4
+3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8
+3956,sub_16,call_function,sub.Tensor,backward,22,1,1,1,2524,2519,4
+3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8
+3958,add_177,call_function,add.Tensor,backward,22,1,1,1,2526,2517,4
+3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8
+3960,convert_element_type_920,call_function,convert_element_type.default,backward,22,1,1,1,3675,2515,6
+3961,alias_default_882,call_function,alias.default,backward,22,1,1,2,3676,2514,4
+3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5
+3963,permute_483,call_function,permute.default,backward,22,1,1,1,4,2510,3
+3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5
+3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10
+3966,permute_484,call_function,permute.default,backward,22,1,1,1,3678,2,4
+3967,dtype_cast_304,call_function,dtype_cast.default,backward,22,1,1,1,3679,1,4
+3968,alias_default_1449,call_function,alias.default,backward,22,1,1,0,3680,0,3
+3969,convert_element_type_925,call_function,convert_element_type.default,backward,22,1,1,1,3684,2507,8
+3970,convert_element_type_926,call_function,convert_element_type.default,backward,22,1,1,1,2496,2507,4
+3971,convert_element_type_927,call_function,convert_element_type.default,backward,22,1,1,1,3,2501,2
+3972,alias_default_883,call_function,alias.default,backward,22,1,1,2,3685,2506,4
+3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8
+3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8
+3975,alias_default_884,call_function,alias.default,backward,22,1,1,2,3688,2499,4
+3976,alias_default_885,call_function,alias.default,backward,22,1,1,3,2505,2505,4
+3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8
+3978,sum_23,call_function,sum.dim_IntList,backward,22,1,1,1,3693,2497,5
+3979,div_39,call_function,div.Tensor,backward,22,1,1,1,2506,2497,6
+3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8
+3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10
+3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8
+3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8
+3984,sum_24,call_function,sum.dim_IntList,backward,22,1,1,1,3690,3,5
+3985,convert_element_type_928,call_function,convert_element_type.default,backward,22,1,1,1,3698,2493,6
+3986,convert_element_type_929,call_function,convert_element_type.default,backward,22,1,1,1,3691,2,3
+3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10
+3988,dtype_cast_305,call_function,dtype_cast.default,backward,22,1,1,1,3692,1,3
+3989,alias_default_1453,call_function,alias.default,backward,22,1,1,0,3693,0,2
+3990,alias_default_886,call_function,alias.default,unknown,,1,1,3,3700,2491,4
+3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5
+3992,permute_487,call_function,permute.default,backward,22,1,1,1,4,2487,3
+3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5
+3994,permute_488,call_function,permute.default,backward,22,1,1,1,3702,2,4
+3995,dtype_cast_306,call_function,dtype_cast.default,backward,22,1,1,1,3703,1,4
+3996,alias_default_1448,call_function,alias.default,backward,22,1,1,0,3704,0,3
+3997,view_766,call_function,view.default,backward,22,1,1,1,3703,2485,4
+3998,permute_489,call_function,permute.default,backward,22,1,1,1,3704,2484,4
+3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2
+4000,getitem_267,call_function,getitem,backward,22,1,1,1,3709,2456,2
+4001,getitem_268,call_function,getitem,backward,22,1,1,1,3709,2457,2
+4002,getitem_269,call_function,getitem,backward,22,1,1,1,3709,2450,2
+4003,permute_490,call_function,permute.default,backward,22,1,1,1,3710,2449,2
+4004,permute_491,call_function,permute.default,backward,22,1,1,1,3710,2456,2
+4005,permute_492,call_function,permute.default,backward,22,1,1,1,3710,2455,2
+4006,convert_element_type_934,call_function,convert_element_type.default,backward,22,1,1,1,3711,2455,2
+4007,convert_element_type_935,call_function,convert_element_type.default,backward,22,1,1,1,3711,2454,2
+4008,view_767,call_function,view.default,backward,22,1,1,1,3712,2454,2
+4009,view_as_complex_66,call_function,view_as_complex.default,backward,22,1,1,1,3713,2453,6
+4010,_conj_10,call_function,_conj.default,backward,22,1,1,1,4,2454,3
+4011,clone_46,call_function,clone.default,backward,22,1,1,1,5,2453,3
+4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8
+4013,view_768,call_function,view.default,backward,22,1,1,1,3712,2453,2
+4014,view_as_complex_67,call_function,view_as_complex.default,backward,22,1,1,1,3713,2452,6
+4015,_conj_11,call_function,_conj.default,backward,22,1,1,1,4,2453,3
+4016,clone_47,call_function,clone.default,backward,22,1,1,1,5,2452,3
+4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8
+4018,view_as_real_66,call_function,view_as_real.default,backward,22,1,1,1,3717,2451,6
+4019,view_769,call_function,view.default,backward,22,1,1,1,3718,2450,6
+4020,convert_element_type_936,call_function,convert_element_type.default,backward,22,1,1,1,3719,2449,6
+4021,view_as_real_67,call_function,view_as_real.default,backward,22,1,1,1,3717,2450,6
+4022,view_770,call_function,view.default,backward,22,1,1,1,3718,2449,6
+4023,convert_element_type_937,call_function,convert_element_type.default,backward,22,1,1,1,3719,2448,6
+4024,view_771,call_function,view.default,backward,22,1,1,1,3711,2448,2
+4025,view_772,call_function,view.default,backward,22,1,1,1,3720,2448,5
+4026,view_773,call_function,view.default,backward,22,1,1,1,3720,2447,5
+4027,alias_default_887,call_function,alias.default,backward,22,1,1,2,3712,2447,4
+4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5
+4029,permute_495,call_function,permute.default,backward,22,1,1,1,4,2443,3
+4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5
+4031,permute_496,call_function,permute.default,backward,22,1,1,1,3714,2,4
+4032,dtype_cast_307,call_function,dtype_cast.default,backward,22,1,1,1,3715,1,4
+4033,alias_default_1447,call_function,alias.default,backward,22,1,1,0,3716,0,3
+4034,alias_default_888,call_function,alias.default,backward,22,1,1,2,3721,2447,4
+4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5
+4036,permute_499,call_function,permute.default,backward,22,1,1,1,4,2443,3
+4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5
+4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10
+4039,permute_500,call_function,permute.default,backward,22,1,1,1,3723,2,4
+4040,dtype_cast_308,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4
+4041,alias_default_1446,call_function,alias.default,backward,22,1,1,0,3725,0,3
+4042,alias_default_889,call_function,alias.default,backward,22,1,1,2,3721,2446,4
+4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5
+4044,permute_503,call_function,permute.default,backward,22,1,1,1,4,2442,3
+4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5
+4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10
+4047,permute_504,call_function,permute.default,backward,22,1,1,1,3723,2,4
+4048,dtype_cast_309,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4
+4049,alias_default_1445,call_function,alias.default,backward,22,1,1,0,3725,0,3
+4050,convert_element_type_950,call_function,convert_element_type.default,backward,22,1,1,1,3747,2439,8
+4051,convert_element_type_951,call_function,convert_element_type.default,backward,22,1,1,1,2429,2439,4
+4052,convert_element_type_952,call_function,convert_element_type.default,backward,22,1,1,1,3,2433,2
+4053,alias_default_890,call_function,alias.default,backward,22,1,1,2,3748,2438,4
+4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8
+4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8
+4056,alias_default_891,call_function,alias.default,backward,22,1,1,2,3751,2431,4
+4057,alias_default_892,call_function,alias.default,backward,22,1,1,3,2438,2437,4
+4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8
+4059,sum_25,call_function,sum.dim_IntList,backward,22,1,1,1,3756,2429,5
+4060,div_40,call_function,div.Tensor,backward,22,1,1,1,2439,2429,6
+4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8
+4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10
+4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8
+4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8
+4065,sum_26,call_function,sum.dim_IntList,backward,22,1,1,1,3753,3,5
+4066,convert_element_type_953,call_function,convert_element_type.default,backward,22,1,1,1,3761,2425,6
+4067,convert_element_type_954,call_function,convert_element_type.default,backward,22,1,1,1,3754,2,3
+4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10
+4069,dtype_cast_310,call_function,dtype_cast.default,backward,22,1,1,1,3755,1,3
+4070,alias_default_1452,call_function,alias.default,backward,22,1,1,0,3756,0,2
+4071,alias_default_893,call_function,alias.default,unknown,,1,1,3,3763,2423,4
+4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5
+4073,permute_507,call_function,permute.default,backward,21,1,1,1,4,2419,3
+4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5
+4075,permute_508,call_function,permute.default,backward,21,1,1,1,3765,2,4
+4076,dtype_cast_311,call_function,dtype_cast.default,backward,21,1,1,1,3766,1,4
+4077,alias_default_1441,call_function,alias.default,backward,21,1,1,0,3767,0,3
+4078,alias_default_894,call_function,alias.default,backward,21,1,1,2,3766,2417,4
+4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8
+4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8
+4081,alias_default_895,call_function,alias.default,backward,21,1,1,2,3768,2404,4
+4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5
+4083,permute_511,call_function,permute.default,backward,21,1,1,1,4,2400,3
+4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5
+4085,permute_512,call_function,permute.default,backward,21,1,1,1,3770,2,4
+4086,dtype_cast_312,call_function,dtype_cast.default,backward,21,1,1,1,3771,1,4
+4087,alias_default_1442,call_function,alias.default,backward,21,1,1,0,3772,0,3
+4088,convert_element_type_963,call_function,convert_element_type.default,backward,21,1,1,1,3768,2408,6
+4089,convert_element_type_964,call_function,convert_element_type.default,backward,21,1,1,1,2406,2418,4
+4090,alias_default_896,call_function,alias.default,backward,21,1,1,2,2407,2417,4
+4091,neg_34,call_function,neg.default,backward,21,1,1,1,2408,2416,8
+4092,exp_34,call_function,exp.default,backward,21,1,1,1,2409,2415,6
+4093,add_183,call_function,add.Tensor,backward,21,1,1,1,2410,2414,4
+4094,reciprocal_6,call_function,reciprocal.default,backward,21,1,1,1,2411,2413,4
+4095,mul_326,call_function,mul.Tensor,backward,21,1,1,1,2412,2412,6
+4096,alias_default_897,call_function,alias.default,backward,21,1,1,2,2413,2411,4
+4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8
+4098,sub_19,call_function,sub.Tensor,backward,21,1,1,1,2414,2409,4
+4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8
+4100,add_184,call_function,add.Tensor,backward,21,1,1,1,2416,2407,4
+4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8
+4102,convert_element_type_965,call_function,convert_element_type.default,backward,21,1,1,1,3782,2405,6
+4103,alias_default_898,call_function,alias.default,backward,21,1,1,2,3783,2404,4
+4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5
+4105,permute_515,call_function,permute.default,backward,21,1,1,1,4,2400,3
+4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5
+4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10
+4108,permute_516,call_function,permute.default,backward,21,1,1,1,3785,2,4
+4109,dtype_cast_313,call_function,dtype_cast.default,backward,21,1,1,1,3786,1,4
+4110,alias_default_1440,call_function,alias.default,backward,21,1,1,0,3787,0,3
+4111,convert_element_type_970,call_function,convert_element_type.default,backward,21,1,1,1,3791,2397,8
+4112,convert_element_type_971,call_function,convert_element_type.default,backward,21,1,1,1,2386,2397,4
+4113,convert_element_type_972,call_function,convert_element_type.default,backward,21,1,1,1,3,2391,2
+4114,alias_default_899,call_function,alias.default,backward,21,1,1,2,3792,2396,4
+4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8
+4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8
+4117,alias_default_900,call_function,alias.default,backward,21,1,1,2,3795,2389,4
+4118,alias_default_901,call_function,alias.default,backward,21,1,1,3,2395,2395,4
+4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8
+4120,sum_27,call_function,sum.dim_IntList,backward,21,1,1,1,3800,2387,5
+4121,div_41,call_function,div.Tensor,backward,21,1,1,1,2396,2387,6
+4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8
+4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10
+4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8
+4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8
+4126,sum_28,call_function,sum.dim_IntList,backward,21,1,1,1,3797,3,5
+4127,convert_element_type_973,call_function,convert_element_type.default,backward,21,1,1,1,3805,2383,6
+4128,convert_element_type_974,call_function,convert_element_type.default,backward,21,1,1,1,3798,2,3
+4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10
+4130,dtype_cast_314,call_function,dtype_cast.default,backward,21,1,1,1,3799,1,3
+4131,alias_default_1444,call_function,alias.default,backward,21,1,1,0,3800,0,2
+4132,alias_default_902,call_function,alias.default,unknown,,1,1,3,3807,2381,4
+4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5
+4134,permute_519,call_function,permute.default,backward,21,1,1,1,4,2377,3
+4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5
+4136,permute_520,call_function,permute.default,backward,21,1,1,1,3809,2,4
+4137,dtype_cast_315,call_function,dtype_cast.default,backward,21,1,1,1,3810,1,4
+4138,alias_default_1439,call_function,alias.default,backward,21,1,1,0,3811,0,3
+4139,view_788,call_function,view.default,backward,21,1,1,1,3810,2375,4
+4140,permute_521,call_function,permute.default,backward,21,1,1,1,3811,2374,4
+4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2
+4142,getitem_270,call_function,getitem,backward,21,1,1,1,3816,2346,2
+4143,getitem_271,call_function,getitem,backward,21,1,1,1,3816,2347,2
+4144,getitem_272,call_function,getitem,backward,21,1,1,1,3816,2340,2
+4145,permute_522,call_function,permute.default,backward,21,1,1,1,3817,2339,2
+4146,permute_523,call_function,permute.default,backward,21,1,1,1,3817,2346,2
+4147,permute_524,call_function,permute.default,backward,21,1,1,1,3817,2345,2
+4148,convert_element_type_979,call_function,convert_element_type.default,backward,21,1,1,1,3818,2345,2
+4149,convert_element_type_980,call_function,convert_element_type.default,backward,21,1,1,1,3818,2344,2
+4150,view_789,call_function,view.default,backward,21,1,1,1,3819,2344,2
+4151,view_as_complex_68,call_function,view_as_complex.default,backward,21,1,1,1,3820,2343,6
+4152,_conj_12,call_function,_conj.default,backward,21,1,1,1,4,2344,3
+4153,clone_54,call_function,clone.default,backward,21,1,1,1,5,2343,3
+4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8
+4155,view_790,call_function,view.default,backward,21,1,1,1,3819,2343,2
+4156,view_as_complex_69,call_function,view_as_complex.default,backward,21,1,1,1,3820,2342,6
+4157,_conj_13,call_function,_conj.default,backward,21,1,1,1,4,2343,3
+4158,clone_55,call_function,clone.default,backward,21,1,1,1,5,2342,3
+4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8
+4160,view_as_real_68,call_function,view_as_real.default,backward,21,1,1,1,3824,2341,6
+4161,view_791,call_function,view.default,backward,21,1,1,1,3825,2340,6
+4162,convert_element_type_981,call_function,convert_element_type.default,backward,21,1,1,1,3826,2339,6
+4163,view_as_real_69,call_function,view_as_real.default,backward,21,1,1,1,3824,2340,6
+4164,view_792,call_function,view.default,backward,21,1,1,1,3825,2339,6
+4165,convert_element_type_982,call_function,convert_element_type.default,backward,21,1,1,1,3826,2338,6
+4166,view_793,call_function,view.default,backward,21,1,1,1,3818,2338,2
+4167,view_794,call_function,view.default,backward,21,1,1,1,3827,2338,5
+4168,view_795,call_function,view.default,backward,21,1,1,1,3827,2337,5
+4169,alias_default_903,call_function,alias.default,backward,21,1,1,2,3819,2337,4
+4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5
+4171,permute_527,call_function,permute.default,backward,21,1,1,1,4,2333,3
+4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5
+4173,permute_528,call_function,permute.default,backward,21,1,1,1,3821,2,4
+4174,dtype_cast_316,call_function,dtype_cast.default,backward,21,1,1,1,3822,1,4
+4175,alias_default_1438,call_function,alias.default,backward,21,1,1,0,3823,0,3
+4176,alias_default_904,call_function,alias.default,backward,21,1,1,2,3828,2337,4
+4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5
+4178,permute_531,call_function,permute.default,backward,21,1,1,1,4,2333,3
+4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5
+4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10
+4181,permute_532,call_function,permute.default,backward,21,1,1,1,3830,2,4
+4182,dtype_cast_317,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4
+4183,alias_default_1437,call_function,alias.default,backward,21,1,1,0,3832,0,3
+4184,alias_default_905,call_function,alias.default,backward,21,1,1,2,3828,2336,4
+4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5
+4186,permute_535,call_function,permute.default,backward,21,1,1,1,4,2332,3
+4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5
+4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10
+4189,permute_536,call_function,permute.default,backward,21,1,1,1,3830,2,4
+4190,dtype_cast_318,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4
+4191,alias_default_1436,call_function,alias.default,backward,21,1,1,0,3832,0,3
+4192,convert_element_type_995,call_function,convert_element_type.default,backward,21,1,1,1,3854,2329,8
+4193,convert_element_type_996,call_function,convert_element_type.default,backward,21,1,1,1,2319,2329,4
+4194,convert_element_type_997,call_function,convert_element_type.default,backward,21,1,1,1,3,2323,2
+4195,alias_default_906,call_function,alias.default,backward,21,1,1,2,3855,2328,4
+4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8
+4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8
+4198,alias_default_907,call_function,alias.default,backward,21,1,1,2,3858,2321,4
+4199,alias_default_908,call_function,alias.default,backward,21,1,1,3,2328,2327,4
+4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8
+4201,sum_29,call_function,sum.dim_IntList,backward,21,1,1,1,3863,2319,5
+4202,div_42,call_function,div.Tensor,backward,21,1,1,1,2329,2319,6
+4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8
+4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10
+4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8
+4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8
+4207,sum_30,call_function,sum.dim_IntList,backward,21,1,1,1,3860,3,5
+4208,convert_element_type_998,call_function,convert_element_type.default,backward,21,1,1,1,3868,2315,6
+4209,convert_element_type_999,call_function,convert_element_type.default,backward,21,1,1,1,3861,2,3
+4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10
+4211,dtype_cast_319,call_function,dtype_cast.default,backward,21,1,1,1,3862,1,3
+4212,alias_default_1443,call_function,alias.default,backward,21,1,1,0,3863,0,2
+4213,alias_default_909,call_function,alias.default,unknown,,1,1,3,3870,2313,4
+4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5
+4215,permute_539,call_function,permute.default,backward,20,1,1,1,4,2309,3
+4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5
+4217,permute_540,call_function,permute.default,backward,20,1,1,1,3872,2,4
+4218,dtype_cast_320,call_function,dtype_cast.default,backward,20,1,1,1,3873,1,4
+4219,alias_default_1432,call_function,alias.default,backward,20,1,1,0,3874,0,3
+4220,alias_default_910,call_function,alias.default,backward,20,1,1,2,3873,2307,4
+4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8
+4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8
+4223,alias_default_911,call_function,alias.default,backward,20,1,1,2,3875,2294,4
+4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5
+4225,permute_543,call_function,permute.default,backward,20,1,1,1,4,2290,3
+4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5
+4227,permute_544,call_function,permute.default,backward,20,1,1,1,3877,2,4
+4228,dtype_cast_321,call_function,dtype_cast.default,backward,20,1,1,1,3878,1,4
+4229,alias_default_1433,call_function,alias.default,backward,20,1,1,0,3879,0,3
+4230,convert_element_type_1008,call_function,convert_element_type.default,backward,20,1,1,1,3875,2298,6
+4231,convert_element_type_1009,call_function,convert_element_type.default,backward,20,1,1,1,2296,2308,4
+4232,alias_default_912,call_function,alias.default,backward,20,1,1,2,2297,2307,4
+4233,neg_35,call_function,neg.default,backward,20,1,1,1,2298,2306,8
+4234,exp_35,call_function,exp.default,backward,20,1,1,1,2299,2305,6
+4235,add_190,call_function,add.Tensor,backward,20,1,1,1,2300,2304,4
+4236,reciprocal_7,call_function,reciprocal.default,backward,20,1,1,1,2301,2303,4
+4237,mul_346,call_function,mul.Tensor,backward,20,1,1,1,2302,2302,6
+4238,alias_default_913,call_function,alias.default,backward,20,1,1,2,2303,2301,4
+4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8
+4240,sub_22,call_function,sub.Tensor,backward,20,1,1,1,2304,2299,4
+4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8
+4242,add_191,call_function,add.Tensor,backward,20,1,1,1,2306,2297,4
+4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8
+4244,convert_element_type_1010,call_function,convert_element_type.default,backward,20,1,1,1,3889,2295,6
+4245,alias_default_914,call_function,alias.default,backward,20,1,1,2,3890,2294,4
+4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5
+4247,permute_547,call_function,permute.default,backward,20,1,1,1,4,2290,3
+4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5
+4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10
+4250,permute_548,call_function,permute.default,backward,20,1,1,1,3892,2,4
+4251,dtype_cast_322,call_function,dtype_cast.default,backward,20,1,1,1,3893,1,4
+4252,alias_default_1431,call_function,alias.default,backward,20,1,1,0,3894,0,3
+4253,convert_element_type_1015,call_function,convert_element_type.default,backward,20,1,1,1,3898,2287,8
+4254,convert_element_type_1016,call_function,convert_element_type.default,backward,20,1,1,1,2276,2287,4
+4255,convert_element_type_1017,call_function,convert_element_type.default,backward,20,1,1,1,3,2281,2
+4256,alias_default_915,call_function,alias.default,backward,20,1,1,2,3899,2286,4
+4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8
+4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8
+4259,alias_default_916,call_function,alias.default,backward,20,1,1,2,3902,2279,4
+4260,alias_default_917,call_function,alias.default,backward,20,1,1,3,2285,2285,4
+4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8
+4262,sum_31,call_function,sum.dim_IntList,backward,20,1,1,1,3907,2277,5
+4263,div_43,call_function,div.Tensor,backward,20,1,1,1,2286,2277,6
+4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8
+4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10
+4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8
+4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8
+4268,sum_32,call_function,sum.dim_IntList,backward,20,1,1,1,3904,3,5
+4269,convert_element_type_1018,call_function,convert_element_type.default,backward,20,1,1,1,3912,2273,6
+4270,convert_element_type_1019,call_function,convert_element_type.default,backward,20,1,1,1,3905,2,3
+4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10
+4272,dtype_cast_323,call_function,dtype_cast.default,backward,20,1,1,1,3906,1,3
+4273,alias_default_1435,call_function,alias.default,backward,20,1,1,0,3907,0,2
+4274,alias_default_918,call_function,alias.default,unknown,,1,1,3,3914,2271,4
+4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5
+4276,permute_551,call_function,permute.default,backward,20,1,1,1,4,2267,3
+4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5
+4278,permute_552,call_function,permute.default,backward,20,1,1,1,3916,2,4
+4279,dtype_cast_324,call_function,dtype_cast.default,backward,20,1,1,1,3917,1,4
+4280,alias_default_1430,call_function,alias.default,backward,20,1,1,0,3918,0,3
+4281,view_810,call_function,view.default,backward,20,1,1,1,3917,2265,4
+4282,permute_553,call_function,permute.default,backward,20,1,1,1,3918,2264,4
+4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2
+4284,getitem_273,call_function,getitem,backward,20,1,1,1,3923,2236,2
+4285,getitem_274,call_function,getitem,backward,20,1,1,1,3923,2237,2
+4286,getitem_275,call_function,getitem,backward,20,1,1,1,3923,2230,2
+4287,permute_554,call_function,permute.default,backward,20,1,1,1,3924,2229,2
+4288,permute_555,call_function,permute.default,backward,20,1,1,1,3924,2236,2
+4289,permute_556,call_function,permute.default,backward,20,1,1,1,3924,2235,2
+4290,convert_element_type_1024,call_function,convert_element_type.default,backward,20,1,1,1,3925,2235,2
+4291,convert_element_type_1025,call_function,convert_element_type.default,backward,20,1,1,1,3925,2234,2
+4292,view_811,call_function,view.default,backward,20,1,1,1,3926,2234,2
+4293,view_as_complex_70,call_function,view_as_complex.default,backward,20,1,1,1,3927,2233,6
+4294,_conj_14,call_function,_conj.default,backward,20,1,1,1,4,2234,3
+4295,clone_62,call_function,clone.default,backward,20,1,1,1,5,2233,3
+4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8
+4297,view_812,call_function,view.default,backward,20,1,1,1,3926,2233,2
+4298,view_as_complex_71,call_function,view_as_complex.default,backward,20,1,1,1,3927,2232,6
+4299,_conj_15,call_function,_conj.default,backward,20,1,1,1,4,2233,3
+4300,clone_63,call_function,clone.default,backward,20,1,1,1,5,2232,3
+4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8
+4302,view_as_real_70,call_function,view_as_real.default,backward,20,1,1,1,3931,2231,6
+4303,view_813,call_function,view.default,backward,20,1,1,1,3932,2230,6
+4304,convert_element_type_1026,call_function,convert_element_type.default,backward,20,1,1,1,3933,2229,6
+4305,view_as_real_71,call_function,view_as_real.default,backward,20,1,1,1,3931,2230,6
+4306,view_814,call_function,view.default,backward,20,1,1,1,3932,2229,6
+4307,convert_element_type_1027,call_function,convert_element_type.default,backward,20,1,1,1,3933,2228,6
+4308,view_815,call_function,view.default,backward,20,1,1,1,3925,2228,2
+4309,view_816,call_function,view.default,backward,20,1,1,1,3934,2228,5
+4310,view_817,call_function,view.default,backward,20,1,1,1,3934,2227,5
+4311,alias_default_919,call_function,alias.default,backward,20,1,1,2,3926,2227,4
+4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5
+4313,permute_559,call_function,permute.default,backward,20,1,1,1,4,2223,3
+4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5
+4315,permute_560,call_function,permute.default,backward,20,1,1,1,3928,2,4
+4316,dtype_cast_325,call_function,dtype_cast.default,backward,20,1,1,1,3929,1,4
+4317,alias_default_1429,call_function,alias.default,backward,20,1,1,0,3930,0,3
+4318,alias_default_920,call_function,alias.default,backward,20,1,1,2,3935,2227,4
+4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5
+4320,permute_563,call_function,permute.default,backward,20,1,1,1,4,2223,3
+4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5
+4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10
+4323,permute_564,call_function,permute.default,backward,20,1,1,1,3937,2,4
+4324,dtype_cast_326,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4
+4325,alias_default_1428,call_function,alias.default,backward,20,1,1,0,3939,0,3
+4326,alias_default_921,call_function,alias.default,backward,20,1,1,2,3935,2226,4
+4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5
+4328,permute_567,call_function,permute.default,backward,20,1,1,1,4,2222,3
+4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5
+4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10
+4331,permute_568,call_function,permute.default,backward,20,1,1,1,3937,2,4
+4332,dtype_cast_327,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4
+4333,alias_default_1427,call_function,alias.default,backward,20,1,1,0,3939,0,3
+4334,convert_element_type_1040,call_function,convert_element_type.default,backward,20,1,1,1,3961,2219,8
+4335,convert_element_type_1041,call_function,convert_element_type.default,backward,20,1,1,1,2209,2219,4
+4336,convert_element_type_1042,call_function,convert_element_type.default,backward,20,1,1,1,3,2213,2
+4337,alias_default_922,call_function,alias.default,backward,20,1,1,2,3962,2218,4
+4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8
+4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8
+4340,alias_default_923,call_function,alias.default,backward,20,1,1,2,3965,2211,4
+4341,alias_default_924,call_function,alias.default,backward,20,1,1,3,2218,2217,4
+4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8
+4343,sum_33,call_function,sum.dim_IntList,backward,20,1,1,1,3970,2209,5
+4344,div_44,call_function,div.Tensor,backward,20,1,1,1,2219,2209,6
+4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8
+4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10
+4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8
+4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8
+4349,sum_34,call_function,sum.dim_IntList,backward,20,1,1,1,3967,3,5
+4350,convert_element_type_1043,call_function,convert_element_type.default,backward,20,1,1,1,3975,2205,6
+4351,convert_element_type_1044,call_function,convert_element_type.default,backward,20,1,1,1,3968,2,3
+4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10
+4353,dtype_cast_328,call_function,dtype_cast.default,backward,20,1,1,1,3969,1,3
+4354,alias_default_1434,call_function,alias.default,backward,20,1,1,0,3970,0,2
+4355,alias_default_925,call_function,alias.default,unknown,,1,1,3,3977,2203,4
+4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5
+4357,permute_571,call_function,permute.default,backward,19,1,1,1,4,2199,3
+4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5
+4359,permute_572,call_function,permute.default,backward,19,1,1,1,3979,2,4
+4360,dtype_cast_329,call_function,dtype_cast.default,backward,19,1,1,1,3980,1,4
+4361,alias_default_1423,call_function,alias.default,backward,19,1,1,0,3981,0,3
+4362,alias_default_926,call_function,alias.default,backward,19,1,1,2,3980,2197,4
+4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8
+4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8
+4365,alias_default_927,call_function,alias.default,backward,19,1,1,2,3982,2184,4
+4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5
+4367,permute_575,call_function,permute.default,backward,19,1,1,1,4,2180,3
+4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5
+4369,permute_576,call_function,permute.default,backward,19,1,1,1,3984,2,4
+4370,dtype_cast_330,call_function,dtype_cast.default,backward,19,1,1,1,3985,1,4
+4371,alias_default_1424,call_function,alias.default,backward,19,1,1,0,3986,0,3
+4372,convert_element_type_1053,call_function,convert_element_type.default,backward,19,1,1,1,3982,2188,6
+4373,convert_element_type_1054,call_function,convert_element_type.default,backward,19,1,1,1,2186,2198,4
+4374,alias_default_928,call_function,alias.default,backward,19,1,1,2,2187,2197,4
+4375,neg_36,call_function,neg.default,backward,19,1,1,1,2188,2196,8
+4376,exp_36,call_function,exp.default,backward,19,1,1,1,2189,2195,6
+4377,add_197,call_function,add.Tensor,backward,19,1,1,1,2190,2194,4
+4378,reciprocal_8,call_function,reciprocal.default,backward,19,1,1,1,2191,2193,4
+4379,mul_366,call_function,mul.Tensor,backward,19,1,1,1,2192,2192,6
+4380,alias_default_929,call_function,alias.default,backward,19,1,1,2,2193,2191,4
+4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8
+4382,sub_25,call_function,sub.Tensor,backward,19,1,1,1,2194,2189,4
+4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8
+4384,add_198,call_function,add.Tensor,backward,19,1,1,1,2196,2187,4
+4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8
+4386,convert_element_type_1055,call_function,convert_element_type.default,backward,19,1,1,1,3996,2185,6
+4387,alias_default_930,call_function,alias.default,backward,19,1,1,2,3997,2184,4
+4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5
+4389,permute_579,call_function,permute.default,backward,19,1,1,1,4,2180,3
+4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5
+4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10
+4392,permute_580,call_function,permute.default,backward,19,1,1,1,3999,2,4
+4393,dtype_cast_331,call_function,dtype_cast.default,backward,19,1,1,1,4000,1,4
+4394,alias_default_1422,call_function,alias.default,backward,19,1,1,0,4001,0,3
+4395,convert_element_type_1060,call_function,convert_element_type.default,backward,19,1,1,1,4005,2177,8
+4396,convert_element_type_1061,call_function,convert_element_type.default,backward,19,1,1,1,2166,2177,4
+4397,convert_element_type_1062,call_function,convert_element_type.default,backward,19,1,1,1,3,2171,2
+4398,alias_default_931,call_function,alias.default,backward,19,1,1,2,4006,2176,4
+4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8
+4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8
+4401,alias_default_932,call_function,alias.default,backward,19,1,1,2,4009,2169,4
+4402,alias_default_933,call_function,alias.default,backward,19,1,1,3,2175,2175,4
+4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8
+4404,sum_35,call_function,sum.dim_IntList,backward,19,1,1,1,4014,2167,5
+4405,div_45,call_function,div.Tensor,backward,19,1,1,1,2176,2167,6
+4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8
+4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10
+4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8
+4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8
+4410,sum_36,call_function,sum.dim_IntList,backward,19,1,1,1,4011,3,5
+4411,convert_element_type_1063,call_function,convert_element_type.default,backward,19,1,1,1,4019,2163,6
+4412,convert_element_type_1064,call_function,convert_element_type.default,backward,19,1,1,1,4012,2,3
+4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10
+4414,dtype_cast_332,call_function,dtype_cast.default,backward,19,1,1,1,4013,1,3
+4415,alias_default_1426,call_function,alias.default,backward,19,1,1,0,4014,0,2
+4416,alias_default_934,call_function,alias.default,unknown,,1,1,3,4021,2161,4
+4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5
+4418,permute_583,call_function,permute.default,backward,19,1,1,1,4,2157,3
+4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5
+4420,permute_584,call_function,permute.default,backward,19,1,1,1,4023,2,4
+4421,dtype_cast_333,call_function,dtype_cast.default,backward,19,1,1,1,4024,1,4
+4422,alias_default_1421,call_function,alias.default,backward,19,1,1,0,4025,0,3
+4423,view_832,call_function,view.default,backward,19,1,1,1,4024,2155,4
+4424,permute_585,call_function,permute.default,backward,19,1,1,1,4025,2154,4
+4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2
+4426,getitem_276,call_function,getitem,backward,19,1,1,1,4030,2126,2
+4427,getitem_277,call_function,getitem,backward,19,1,1,1,4030,2127,2
+4428,getitem_278,call_function,getitem,backward,19,1,1,1,4030,2120,2
+4429,permute_586,call_function,permute.default,backward,19,1,1,1,4031,2119,2
+4430,permute_587,call_function,permute.default,backward,19,1,1,1,4031,2126,2
+4431,permute_588,call_function,permute.default,backward,19,1,1,1,4031,2125,2
+4432,convert_element_type_1069,call_function,convert_element_type.default,backward,19,1,1,1,4032,2125,2
+4433,convert_element_type_1070,call_function,convert_element_type.default,backward,19,1,1,1,4032,2124,2
+4434,view_833,call_function,view.default,backward,19,1,1,1,4033,2124,2
+4435,view_as_complex_72,call_function,view_as_complex.default,backward,19,1,1,1,4034,2123,6
+4436,_conj_16,call_function,_conj.default,backward,19,1,1,1,4,2124,3
+4437,clone_70,call_function,clone.default,backward,19,1,1,1,5,2123,3
+4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8
+4439,view_834,call_function,view.default,backward,19,1,1,1,4033,2123,2
+4440,view_as_complex_73,call_function,view_as_complex.default,backward,19,1,1,1,4034,2122,6
+4441,_conj_17,call_function,_conj.default,backward,19,1,1,1,4,2123,3
+4442,clone_71,call_function,clone.default,backward,19,1,1,1,5,2122,3
+4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8
+4444,view_as_real_72,call_function,view_as_real.default,backward,19,1,1,1,4038,2121,6
+4445,view_835,call_function,view.default,backward,19,1,1,1,4039,2120,6
+4446,convert_element_type_1071,call_function,convert_element_type.default,backward,19,1,1,1,4040,2119,6
+4447,view_as_real_73,call_function,view_as_real.default,backward,19,1,1,1,4038,2120,6
+4448,view_836,call_function,view.default,backward,19,1,1,1,4039,2119,6
+4449,convert_element_type_1072,call_function,convert_element_type.default,backward,19,1,1,1,4040,2118,6
+4450,view_837,call_function,view.default,backward,19,1,1,1,4032,2118,2
+4451,view_838,call_function,view.default,backward,19,1,1,1,4041,2118,5
+4452,view_839,call_function,view.default,backward,19,1,1,1,4041,2117,5
+4453,alias_default_935,call_function,alias.default,backward,19,1,1,2,4033,2117,4
+4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5
+4455,permute_591,call_function,permute.default,backward,19,1,1,1,4,2113,3
+4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5
+4457,permute_592,call_function,permute.default,backward,19,1,1,1,4035,2,4
+4458,dtype_cast_334,call_function,dtype_cast.default,backward,19,1,1,1,4036,1,4
+4459,alias_default_1420,call_function,alias.default,backward,19,1,1,0,4037,0,3
+4460,alias_default_936,call_function,alias.default,backward,19,1,1,2,4042,2117,4
+4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5
+4462,permute_595,call_function,permute.default,backward,19,1,1,1,4,2113,3
+4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5
+4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10
+4465,permute_596,call_function,permute.default,backward,19,1,1,1,4044,2,4
+4466,dtype_cast_335,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4
+4467,alias_default_1419,call_function,alias.default,backward,19,1,1,0,4046,0,3
+4468,alias_default_937,call_function,alias.default,backward,19,1,1,2,4042,2116,4
+4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5
+4470,permute_599,call_function,permute.default,backward,19,1,1,1,4,2112,3
+4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5
+4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10
+4473,permute_600,call_function,permute.default,backward,19,1,1,1,4044,2,4
+4474,dtype_cast_336,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4
+4475,alias_default_1418,call_function,alias.default,backward,19,1,1,0,4046,0,3
+4476,convert_element_type_1085,call_function,convert_element_type.default,backward,19,1,1,1,4068,2109,8
+4477,convert_element_type_1086,call_function,convert_element_type.default,backward,19,1,1,1,2099,2109,4
+4478,convert_element_type_1087,call_function,convert_element_type.default,backward,19,1,1,1,3,2103,2
+4479,alias_default_938,call_function,alias.default,backward,19,1,1,2,4069,2108,4
+4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8
+4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8
+4482,alias_default_939,call_function,alias.default,backward,19,1,1,2,4072,2101,4
+4483,alias_default_940,call_function,alias.default,backward,19,1,1,3,2108,2107,4
+4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8
+4485,sum_37,call_function,sum.dim_IntList,backward,19,1,1,1,4077,2099,5
+4486,div_46,call_function,div.Tensor,backward,19,1,1,1,2109,2099,6
+4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8
+4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10
+4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8
+4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8
+4491,sum_38,call_function,sum.dim_IntList,backward,19,1,1,1,4074,3,5
+4492,convert_element_type_1088,call_function,convert_element_type.default,backward,19,1,1,1,4082,2095,6
+4493,convert_element_type_1089,call_function,convert_element_type.default,backward,19,1,1,1,4075,2,3
+4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10
+4495,dtype_cast_337,call_function,dtype_cast.default,backward,19,1,1,1,4076,1,3
+4496,alias_default_1425,call_function,alias.default,backward,19,1,1,0,4077,0,2
+4497,alias_default_941,call_function,alias.default,unknown,,1,1,3,4084,2093,4
+4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5
+4499,permute_603,call_function,permute.default,backward,18,1,1,1,4,2089,3
+4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5
+4501,permute_604,call_function,permute.default,backward,18,1,1,1,4086,2,4
+4502,dtype_cast_338,call_function,dtype_cast.default,backward,18,1,1,1,4087,1,4
+4503,alias_default_1414,call_function,alias.default,backward,18,1,1,0,4088,0,3
+4504,alias_default_942,call_function,alias.default,backward,18,1,1,2,4087,2087,4
+4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8
+4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8
+4507,alias_default_943,call_function,alias.default,backward,18,1,1,2,4089,2074,4
+4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5
+4509,permute_607,call_function,permute.default,backward,18,1,1,1,4,2070,3
+4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5
+4511,permute_608,call_function,permute.default,backward,18,1,1,1,4091,2,4
+4512,dtype_cast_339,call_function,dtype_cast.default,backward,18,1,1,1,4092,1,4
+4513,alias_default_1415,call_function,alias.default,backward,18,1,1,0,4093,0,3
+4514,convert_element_type_1098,call_function,convert_element_type.default,backward,18,1,1,1,4089,2078,6
+4515,convert_element_type_1099,call_function,convert_element_type.default,backward,18,1,1,1,2076,2088,4
+4516,alias_default_944,call_function,alias.default,backward,18,1,1,2,2077,2087,4
+4517,neg_37,call_function,neg.default,backward,18,1,1,1,2078,2086,8
+4518,exp_37,call_function,exp.default,backward,18,1,1,1,2079,2085,6
+4519,add_204,call_function,add.Tensor,backward,18,1,1,1,2080,2084,4
+4520,reciprocal_9,call_function,reciprocal.default,backward,18,1,1,1,2081,2083,4
+4521,mul_386,call_function,mul.Tensor,backward,18,1,1,1,2082,2082,6
+4522,alias_default_945,call_function,alias.default,backward,18,1,1,2,2083,2081,4
+4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8
+4524,sub_28,call_function,sub.Tensor,backward,18,1,1,1,2084,2079,4
+4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8
+4526,add_205,call_function,add.Tensor,backward,18,1,1,1,2086,2077,4
+4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8
+4528,convert_element_type_1100,call_function,convert_element_type.default,backward,18,1,1,1,4103,2075,6
+4529,alias_default_946,call_function,alias.default,backward,18,1,1,2,4104,2074,4
+4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5
+4531,permute_611,call_function,permute.default,backward,18,1,1,1,4,2070,3
+4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5
+4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10
+4534,permute_612,call_function,permute.default,backward,18,1,1,1,4106,2,4
+4535,dtype_cast_340,call_function,dtype_cast.default,backward,18,1,1,1,4107,1,4
+4536,alias_default_1413,call_function,alias.default,backward,18,1,1,0,4108,0,3
+4537,convert_element_type_1105,call_function,convert_element_type.default,backward,18,1,1,1,4112,2067,8
+4538,convert_element_type_1106,call_function,convert_element_type.default,backward,18,1,1,1,2056,2067,4
+4539,convert_element_type_1107,call_function,convert_element_type.default,backward,18,1,1,1,3,2061,2
+4540,alias_default_947,call_function,alias.default,backward,18,1,1,2,4113,2066,4
+4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8
+4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8
+4543,alias_default_948,call_function,alias.default,backward,18,1,1,2,4116,2059,4
+4544,alias_default_949,call_function,alias.default,backward,18,1,1,3,2065,2065,4
+4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8
+4546,sum_39,call_function,sum.dim_IntList,backward,18,1,1,1,4121,2057,5
+4547,div_47,call_function,div.Tensor,backward,18,1,1,1,2066,2057,6
+4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8
+4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10
+4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8
+4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8
+4552,sum_40,call_function,sum.dim_IntList,backward,18,1,1,1,4118,3,5
+4553,convert_element_type_1108,call_function,convert_element_type.default,backward,18,1,1,1,4126,2053,6
+4554,convert_element_type_1109,call_function,convert_element_type.default,backward,18,1,1,1,4119,2,3
+4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10
+4556,dtype_cast_341,call_function,dtype_cast.default,backward,18,1,1,1,4120,1,3
+4557,alias_default_1417,call_function,alias.default,backward,18,1,1,0,4121,0,2
+4558,alias_default_950,call_function,alias.default,unknown,,1,1,3,4128,2051,4
+4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5
+4560,permute_615,call_function,permute.default,backward,18,1,1,1,4,2047,3
+4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5
+4562,permute_616,call_function,permute.default,backward,18,1,1,1,4130,2,4
+4563,dtype_cast_342,call_function,dtype_cast.default,backward,18,1,1,1,4131,1,4
+4564,alias_default_1412,call_function,alias.default,backward,18,1,1,0,4132,0,3
+4565,view_854,call_function,view.default,backward,18,1,1,1,4131,2045,4
+4566,permute_617,call_function,permute.default,backward,18,1,1,1,4132,2044,4
+4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2
+4568,getitem_279,call_function,getitem,backward,18,1,1,1,4137,2016,2
+4569,getitem_280,call_function,getitem,backward,18,1,1,1,4137,2017,2
+4570,getitem_281,call_function,getitem,backward,18,1,1,1,4137,2010,2
+4571,permute_618,call_function,permute.default,backward,18,1,1,1,4138,2009,2
+4572,permute_619,call_function,permute.default,backward,18,1,1,1,4138,2016,2
+4573,permute_620,call_function,permute.default,backward,18,1,1,1,4138,2015,2
+4574,convert_element_type_1114,call_function,convert_element_type.default,backward,18,1,1,1,4139,2015,2
+4575,convert_element_type_1115,call_function,convert_element_type.default,backward,18,1,1,1,4139,2014,2
+4576,view_855,call_function,view.default,backward,18,1,1,1,4140,2014,2
+4577,view_as_complex_74,call_function,view_as_complex.default,backward,18,1,1,1,4141,2013,6
+4578,_conj_18,call_function,_conj.default,backward,18,1,1,1,4,2014,3
+4579,clone_78,call_function,clone.default,backward,18,1,1,1,5,2013,3
+4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8
+4581,view_856,call_function,view.default,backward,18,1,1,1,4140,2013,2
+4582,view_as_complex_75,call_function,view_as_complex.default,backward,18,1,1,1,4141,2012,6
+4583,_conj_19,call_function,_conj.default,backward,18,1,1,1,4,2013,3
+4584,clone_79,call_function,clone.default,backward,18,1,1,1,5,2012,3
+4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8
+4586,view_as_real_74,call_function,view_as_real.default,backward,18,1,1,1,4145,2011,6
+4587,view_857,call_function,view.default,backward,18,1,1,1,4146,2010,6
+4588,convert_element_type_1116,call_function,convert_element_type.default,backward,18,1,1,1,4147,2009,6
+4589,view_as_real_75,call_function,view_as_real.default,backward,18,1,1,1,4145,2010,6
+4590,view_858,call_function,view.default,backward,18,1,1,1,4146,2009,6
+4591,convert_element_type_1117,call_function,convert_element_type.default,backward,18,1,1,1,4147,2008,6
+4592,view_859,call_function,view.default,backward,18,1,1,1,4139,2008,2
+4593,view_860,call_function,view.default,backward,18,1,1,1,4148,2008,5
+4594,view_861,call_function,view.default,backward,18,1,1,1,4148,2007,5
+4595,alias_default_951,call_function,alias.default,backward,18,1,1,2,4140,2007,4
+4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5
+4597,permute_623,call_function,permute.default,backward,18,1,1,1,4,2003,3
+4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5
+4599,permute_624,call_function,permute.default,backward,18,1,1,1,4142,2,4
+4600,dtype_cast_343,call_function,dtype_cast.default,backward,18,1,1,1,4143,1,4
+4601,alias_default_1411,call_function,alias.default,backward,18,1,1,0,4144,0,3
+4602,alias_default_952,call_function,alias.default,backward,18,1,1,2,4149,2007,4
+4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5
+4604,permute_627,call_function,permute.default,backward,18,1,1,1,4,2003,3
+4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5
+4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10
+4607,permute_628,call_function,permute.default,backward,18,1,1,1,4151,2,4
+4608,dtype_cast_344,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4
+4609,alias_default_1410,call_function,alias.default,backward,18,1,1,0,4153,0,3
+4610,alias_default_953,call_function,alias.default,backward,18,1,1,2,4149,2006,4
+4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5
+4612,permute_631,call_function,permute.default,backward,18,1,1,1,4,2002,3
+4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5
+4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10
+4615,permute_632,call_function,permute.default,backward,18,1,1,1,4151,2,4
+4616,dtype_cast_345,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4
+4617,alias_default_1409,call_function,alias.default,backward,18,1,1,0,4153,0,3
+4618,convert_element_type_1130,call_function,convert_element_type.default,backward,18,1,1,1,4175,1999,8
+4619,convert_element_type_1131,call_function,convert_element_type.default,backward,18,1,1,1,1989,1999,4
+4620,convert_element_type_1132,call_function,convert_element_type.default,backward,18,1,1,1,3,1993,2
+4621,alias_default_954,call_function,alias.default,backward,18,1,1,2,4176,1998,4
+4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8
+4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8
+4624,alias_default_955,call_function,alias.default,backward,18,1,1,2,4179,1991,4
+4625,alias_default_956,call_function,alias.default,backward,18,1,1,3,1998,1997,4
+4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8
+4627,sum_41,call_function,sum.dim_IntList,backward,18,1,1,1,4184,1989,5
+4628,div_48,call_function,div.Tensor,backward,18,1,1,1,1999,1989,6
+4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8
+4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10
+4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8
+4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8
+4633,sum_42,call_function,sum.dim_IntList,backward,18,1,1,1,4181,3,5
+4634,convert_element_type_1133,call_function,convert_element_type.default,backward,18,1,1,1,4189,1985,6
+4635,convert_element_type_1134,call_function,convert_element_type.default,backward,18,1,1,1,4182,2,3
+4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10
+4637,dtype_cast_346,call_function,dtype_cast.default,backward,18,1,1,1,4183,1,3
+4638,alias_default_1416,call_function,alias.default,backward,18,1,1,0,4184,0,2
+4639,alias_default_957,call_function,alias.default,unknown,,1,1,3,4191,1983,4
+4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5
+4641,permute_635,call_function,permute.default,backward,17,1,1,1,4,1979,3
+4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5
+4643,permute_636,call_function,permute.default,backward,17,1,1,1,4193,2,4
+4644,dtype_cast_347,call_function,dtype_cast.default,backward,17,1,1,1,4194,1,4
+4645,alias_default_1405,call_function,alias.default,backward,17,1,1,0,4195,0,3
+4646,alias_default_958,call_function,alias.default,backward,17,1,1,2,4194,1977,4
+4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8
+4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8
+4649,alias_default_959,call_function,alias.default,backward,17,1,1,2,4196,1964,4
+4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5
+4651,permute_639,call_function,permute.default,backward,17,1,1,1,4,1960,3
+4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5
+4653,permute_640,call_function,permute.default,backward,17,1,1,1,4198,2,4
+4654,dtype_cast_348,call_function,dtype_cast.default,backward,17,1,1,1,4199,1,4
+4655,alias_default_1406,call_function,alias.default,backward,17,1,1,0,4200,0,3
+4656,convert_element_type_1143,call_function,convert_element_type.default,backward,17,1,1,1,4196,1968,6
+4657,convert_element_type_1144,call_function,convert_element_type.default,backward,17,1,1,1,1966,1978,4
+4658,alias_default_960,call_function,alias.default,backward,17,1,1,2,1967,1977,4
+4659,neg_38,call_function,neg.default,backward,17,1,1,1,1968,1976,8
+4660,exp_38,call_function,exp.default,backward,17,1,1,1,1969,1975,6
+4661,add_211,call_function,add.Tensor,backward,17,1,1,1,1970,1974,4
+4662,reciprocal_10,call_function,reciprocal.default,backward,17,1,1,1,1971,1973,4
+4663,mul_406,call_function,mul.Tensor,backward,17,1,1,1,1972,1972,6
+4664,alias_default_961,call_function,alias.default,backward,17,1,1,2,1973,1971,4
+4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8
+4666,sub_31,call_function,sub.Tensor,backward,17,1,1,1,1974,1969,4
+4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8
+4668,add_212,call_function,add.Tensor,backward,17,1,1,1,1976,1967,4
+4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8
+4670,convert_element_type_1145,call_function,convert_element_type.default,backward,17,1,1,1,4210,1965,6
+4671,alias_default_962,call_function,alias.default,backward,17,1,1,2,4211,1964,4
+4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5
+4673,permute_643,call_function,permute.default,backward,17,1,1,1,4,1960,3
+4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5
+4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10
+4676,permute_644,call_function,permute.default,backward,17,1,1,1,4213,2,4
+4677,dtype_cast_349,call_function,dtype_cast.default,backward,17,1,1,1,4214,1,4
+4678,alias_default_1404,call_function,alias.default,backward,17,1,1,0,4215,0,3
+4679,convert_element_type_1150,call_function,convert_element_type.default,backward,17,1,1,1,4219,1957,8
+4680,convert_element_type_1151,call_function,convert_element_type.default,backward,17,1,1,1,1946,1957,4
+4681,convert_element_type_1152,call_function,convert_element_type.default,backward,17,1,1,1,3,1951,2
+4682,alias_default_963,call_function,alias.default,backward,17,1,1,2,4220,1956,4
+4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8
+4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8
+4685,alias_default_964,call_function,alias.default,backward,17,1,1,2,4223,1949,4
+4686,alias_default_965,call_function,alias.default,backward,17,1,1,3,1955,1955,4
+4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8
+4688,sum_43,call_function,sum.dim_IntList,backward,17,1,1,1,4228,1947,5
+4689,div_49,call_function,div.Tensor,backward,17,1,1,1,1956,1947,6
+4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8
+4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10
+4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8
+4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8
+4694,sum_44,call_function,sum.dim_IntList,backward,17,1,1,1,4225,3,5
+4695,convert_element_type_1153,call_function,convert_element_type.default,backward,17,1,1,1,4233,1943,6
+4696,convert_element_type_1154,call_function,convert_element_type.default,backward,17,1,1,1,4226,2,3
+4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10
+4698,dtype_cast_350,call_function,dtype_cast.default,backward,17,1,1,1,4227,1,3
+4699,alias_default_1408,call_function,alias.default,backward,17,1,1,0,4228,0,2
+4700,alias_default_966,call_function,alias.default,unknown,,1,1,3,4235,1941,4
+4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5
+4702,permute_647,call_function,permute.default,backward,17,1,1,1,4,1937,3
+4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5
+4704,permute_648,call_function,permute.default,backward,17,1,1,1,4237,2,4
+4705,dtype_cast_351,call_function,dtype_cast.default,backward,17,1,1,1,4238,1,4
+4706,alias_default_1403,call_function,alias.default,backward,17,1,1,0,4239,0,3
+4707,view_876,call_function,view.default,backward,17,1,1,1,4238,1935,4
+4708,permute_649,call_function,permute.default,backward,17,1,1,1,4239,1934,4
+4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2
+4710,getitem_282,call_function,getitem,backward,17,1,1,1,4244,1906,2
+4711,getitem_283,call_function,getitem,backward,17,1,1,1,4244,1907,2
+4712,getitem_284,call_function,getitem,backward,17,1,1,1,4244,1900,2
+4713,permute_650,call_function,permute.default,backward,17,1,1,1,4245,1899,2
+4714,permute_651,call_function,permute.default,backward,17,1,1,1,4245,1906,2
+4715,permute_652,call_function,permute.default,backward,17,1,1,1,4245,1905,2
+4716,convert_element_type_1159,call_function,convert_element_type.default,backward,17,1,1,1,4246,1905,2
+4717,convert_element_type_1160,call_function,convert_element_type.default,backward,17,1,1,1,4246,1904,2
+4718,view_877,call_function,view.default,backward,17,1,1,1,4247,1904,2
+4719,view_as_complex_76,call_function,view_as_complex.default,backward,17,1,1,1,4248,1903,6
+4720,_conj_20,call_function,_conj.default,backward,17,1,1,1,4,1904,3
+4721,clone_86,call_function,clone.default,backward,17,1,1,1,5,1903,3
+4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8
+4723,view_878,call_function,view.default,backward,17,1,1,1,4247,1903,2
+4724,view_as_complex_77,call_function,view_as_complex.default,backward,17,1,1,1,4248,1902,6
+4725,_conj_21,call_function,_conj.default,backward,17,1,1,1,4,1903,3
+4726,clone_87,call_function,clone.default,backward,17,1,1,1,5,1902,3
+4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8
+4728,view_as_real_76,call_function,view_as_real.default,backward,17,1,1,1,4252,1901,6
+4729,view_879,call_function,view.default,backward,17,1,1,1,4253,1900,6
+4730,convert_element_type_1161,call_function,convert_element_type.default,backward,17,1,1,1,4254,1899,6
+4731,view_as_real_77,call_function,view_as_real.default,backward,17,1,1,1,4252,1900,6
+4732,view_880,call_function,view.default,backward,17,1,1,1,4253,1899,6
+4733,convert_element_type_1162,call_function,convert_element_type.default,backward,17,1,1,1,4254,1898,6
+4734,view_881,call_function,view.default,backward,17,1,1,1,4246,1898,2
+4735,view_882,call_function,view.default,backward,17,1,1,1,4255,1898,5
+4736,view_883,call_function,view.default,backward,17,1,1,1,4255,1897,5
+4737,alias_default_967,call_function,alias.default,backward,17,1,1,2,4247,1897,4
+4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5
+4739,permute_655,call_function,permute.default,backward,17,1,1,1,4,1893,3
+4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5
+4741,permute_656,call_function,permute.default,backward,17,1,1,1,4249,2,4
+4742,dtype_cast_352,call_function,dtype_cast.default,backward,17,1,1,1,4250,1,4
+4743,alias_default_1402,call_function,alias.default,backward,17,1,1,0,4251,0,3
+4744,alias_default_968,call_function,alias.default,backward,17,1,1,2,4256,1897,4
+4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5
+4746,permute_659,call_function,permute.default,backward,17,1,1,1,4,1893,3
+4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5
+4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10
+4749,permute_660,call_function,permute.default,backward,17,1,1,1,4258,2,4
+4750,dtype_cast_353,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4
+4751,alias_default_1401,call_function,alias.default,backward,17,1,1,0,4260,0,3
+4752,alias_default_969,call_function,alias.default,backward,17,1,1,2,4256,1896,4
+4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5
+4754,permute_663,call_function,permute.default,backward,17,1,1,1,4,1892,3
+4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5
+4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10
+4757,permute_664,call_function,permute.default,backward,17,1,1,1,4258,2,4
+4758,dtype_cast_354,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4
+4759,alias_default_1400,call_function,alias.default,backward,17,1,1,0,4260,0,3
+4760,convert_element_type_1175,call_function,convert_element_type.default,backward,17,1,1,1,4282,1889,8
+4761,convert_element_type_1176,call_function,convert_element_type.default,backward,17,1,1,1,1879,1889,4
+4762,convert_element_type_1177,call_function,convert_element_type.default,backward,17,1,1,1,3,1883,2
+4763,alias_default_970,call_function,alias.default,backward,17,1,1,2,4283,1888,4
+4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8
+4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8
+4766,alias_default_971,call_function,alias.default,backward,17,1,1,2,4286,1881,4
+4767,alias_default_972,call_function,alias.default,backward,17,1,1,3,1888,1887,4
+4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8
+4769,sum_45,call_function,sum.dim_IntList,backward,17,1,1,1,4291,1879,5
+4770,div_50,call_function,div.Tensor,backward,17,1,1,1,1889,1879,6
+4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8
+4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10
+4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8
+4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8
+4775,sum_46,call_function,sum.dim_IntList,backward,17,1,1,1,4288,3,5
+4776,convert_element_type_1178,call_function,convert_element_type.default,backward,17,1,1,1,4296,1875,6
+4777,convert_element_type_1179,call_function,convert_element_type.default,backward,17,1,1,1,4289,2,3
+4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10
+4779,dtype_cast_355,call_function,dtype_cast.default,backward,17,1,1,1,4290,1,3
+4780,alias_default_1407,call_function,alias.default,backward,17,1,1,0,4291,0,2
+4781,alias_default_973,call_function,alias.default,unknown,,1,1,3,4298,1873,4
+4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5
+4783,permute_667,call_function,permute.default,backward,16,1,1,1,4,1869,3
+4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5
+4785,permute_668,call_function,permute.default,backward,16,1,1,1,4300,2,4
+4786,dtype_cast_356,call_function,dtype_cast.default,backward,16,1,1,1,4301,1,4
+4787,alias_default_1396,call_function,alias.default,backward,16,1,1,0,4302,0,3
+4788,alias_default_974,call_function,alias.default,backward,16,1,1,2,4301,1867,4
+4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8
+4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8
+4791,alias_default_975,call_function,alias.default,backward,16,1,1,2,4303,1854,4
+4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5
+4793,permute_671,call_function,permute.default,backward,16,1,1,1,4,1850,3
+4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5
+4795,permute_672,call_function,permute.default,backward,16,1,1,1,4305,2,4
+4796,dtype_cast_357,call_function,dtype_cast.default,backward,16,1,1,1,4306,1,4
+4797,alias_default_1397,call_function,alias.default,backward,16,1,1,0,4307,0,3
+4798,convert_element_type_1188,call_function,convert_element_type.default,backward,16,1,1,1,4303,1858,6
+4799,convert_element_type_1189,call_function,convert_element_type.default,backward,16,1,1,1,1856,1868,4
+4800,alias_default_976,call_function,alias.default,backward,16,1,1,2,1857,1867,4
+4801,neg_39,call_function,neg.default,backward,16,1,1,1,1858,1866,8
+4802,exp_39,call_function,exp.default,backward,16,1,1,1,1859,1865,6
+4803,add_218,call_function,add.Tensor,backward,16,1,1,1,1860,1864,4
+4804,reciprocal_11,call_function,reciprocal.default,backward,16,1,1,1,1861,1863,4
+4805,mul_426,call_function,mul.Tensor,backward,16,1,1,1,1862,1862,6
+4806,alias_default_977,call_function,alias.default,backward,16,1,1,2,1863,1861,4
+4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8
+4808,sub_34,call_function,sub.Tensor,backward,16,1,1,1,1864,1859,4
+4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8
+4810,add_219,call_function,add.Tensor,backward,16,1,1,1,1866,1857,4
+4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8
+4812,convert_element_type_1190,call_function,convert_element_type.default,backward,16,1,1,1,4317,1855,6
+4813,alias_default_978,call_function,alias.default,backward,16,1,1,2,4318,1854,4
+4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5
+4815,permute_675,call_function,permute.default,backward,16,1,1,1,4,1850,3
+4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5
+4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10
+4818,permute_676,call_function,permute.default,backward,16,1,1,1,4320,2,4
+4819,dtype_cast_358,call_function,dtype_cast.default,backward,16,1,1,1,4321,1,4
+4820,alias_default_1395,call_function,alias.default,backward,16,1,1,0,4322,0,3
+4821,convert_element_type_1195,call_function,convert_element_type.default,backward,16,1,1,1,4326,1847,8
+4822,convert_element_type_1196,call_function,convert_element_type.default,backward,16,1,1,1,1836,1847,4
+4823,convert_element_type_1197,call_function,convert_element_type.default,backward,16,1,1,1,3,1841,2
+4824,alias_default_979,call_function,alias.default,backward,16,1,1,2,4327,1846,4
+4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8
+4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8
+4827,alias_default_980,call_function,alias.default,backward,16,1,1,2,4330,1839,4
+4828,alias_default_981,call_function,alias.default,backward,16,1,1,3,1845,1845,4
+4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8
+4830,sum_47,call_function,sum.dim_IntList,backward,16,1,1,1,4335,1837,5
+4831,div_51,call_function,div.Tensor,backward,16,1,1,1,1846,1837,6
+4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8
+4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10
+4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8
+4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8
+4836,sum_48,call_function,sum.dim_IntList,backward,16,1,1,1,4332,3,5
+4837,convert_element_type_1198,call_function,convert_element_type.default,backward,16,1,1,1,4340,1833,6
+4838,convert_element_type_1199,call_function,convert_element_type.default,backward,16,1,1,1,4333,2,3
+4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10
+4840,dtype_cast_359,call_function,dtype_cast.default,backward,16,1,1,1,4334,1,3
+4841,alias_default_1399,call_function,alias.default,backward,16,1,1,0,4335,0,2
+4842,alias_default_982,call_function,alias.default,unknown,,1,1,3,4342,1831,4
+4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5
+4844,permute_679,call_function,permute.default,backward,16,1,1,1,4,1827,3
+4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5
+4846,permute_680,call_function,permute.default,backward,16,1,1,1,4344,2,4
+4847,dtype_cast_360,call_function,dtype_cast.default,backward,16,1,1,1,4345,1,4
+4848,alias_default_1394,call_function,alias.default,backward,16,1,1,0,4346,0,3
+4849,view_898,call_function,view.default,backward,16,1,1,1,4345,1825,4
+4850,permute_681,call_function,permute.default,backward,16,1,1,1,4346,1824,4
+4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2
+4852,getitem_285,call_function,getitem,backward,16,1,1,1,4351,1796,2
+4853,getitem_286,call_function,getitem,backward,16,1,1,1,4351,1797,2
+4854,getitem_287,call_function,getitem,backward,16,1,1,1,4351,1790,2
+4855,permute_682,call_function,permute.default,backward,16,1,1,1,4352,1789,2
+4856,permute_683,call_function,permute.default,backward,16,1,1,1,4352,1796,2
+4857,permute_684,call_function,permute.default,backward,16,1,1,1,4352,1795,2
+4858,convert_element_type_1204,call_function,convert_element_type.default,backward,16,1,1,1,4353,1795,2
+4859,convert_element_type_1205,call_function,convert_element_type.default,backward,16,1,1,1,4353,1794,2
+4860,view_899,call_function,view.default,backward,16,1,1,1,4354,1794,2
+4861,view_as_complex_78,call_function,view_as_complex.default,backward,16,1,1,1,4355,1793,6
+4862,_conj_22,call_function,_conj.default,backward,16,1,1,1,4,1794,3
+4863,clone_94,call_function,clone.default,backward,16,1,1,1,5,1793,3
+4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8
+4865,view_900,call_function,view.default,backward,16,1,1,1,4354,1793,2
+4866,view_as_complex_79,call_function,view_as_complex.default,backward,16,1,1,1,4355,1792,6
+4867,_conj_23,call_function,_conj.default,backward,16,1,1,1,4,1793,3
+4868,clone_95,call_function,clone.default,backward,16,1,1,1,5,1792,3
+4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8
+4870,view_as_real_78,call_function,view_as_real.default,backward,16,1,1,1,4359,1791,6
+4871,view_901,call_function,view.default,backward,16,1,1,1,4360,1790,6
+4872,convert_element_type_1206,call_function,convert_element_type.default,backward,16,1,1,1,4361,1789,6
+4873,view_as_real_79,call_function,view_as_real.default,backward,16,1,1,1,4359,1790,6
+4874,view_902,call_function,view.default,backward,16,1,1,1,4360,1789,6
+4875,convert_element_type_1207,call_function,convert_element_type.default,backward,16,1,1,1,4361,1788,6
+4876,view_903,call_function,view.default,backward,16,1,1,1,4353,1788,2
+4877,view_904,call_function,view.default,backward,16,1,1,1,4362,1788,5
+4878,view_905,call_function,view.default,backward,16,1,1,1,4362,1787,5
+4879,alias_default_983,call_function,alias.default,backward,16,1,1,2,4354,1787,4
+4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5
+4881,permute_687,call_function,permute.default,backward,16,1,1,1,4,1783,3
+4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5
+4883,permute_688,call_function,permute.default,backward,16,1,1,1,4356,2,4
+4884,dtype_cast_361,call_function,dtype_cast.default,backward,16,1,1,1,4357,1,4
+4885,alias_default_1393,call_function,alias.default,backward,16,1,1,0,4358,0,3
+4886,alias_default_984,call_function,alias.default,backward,16,1,1,2,4363,1787,4
+4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5
+4888,permute_691,call_function,permute.default,backward,16,1,1,1,4,1783,3
+4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5
+4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10
+4891,permute_692,call_function,permute.default,backward,16,1,1,1,4365,2,4
+4892,dtype_cast_362,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4
+4893,alias_default_1392,call_function,alias.default,backward,16,1,1,0,4367,0,3
+4894,alias_default_985,call_function,alias.default,backward,16,1,1,2,4363,1786,4
+4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5
+4896,permute_695,call_function,permute.default,backward,16,1,1,1,4,1782,3
+4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5
+4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10
+4899,permute_696,call_function,permute.default,backward,16,1,1,1,4365,2,4
+4900,dtype_cast_363,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4
+4901,alias_default_1391,call_function,alias.default,backward,16,1,1,0,4367,0,3
+4902,convert_element_type_1220,call_function,convert_element_type.default,backward,16,1,1,1,4389,1779,8
+4903,convert_element_type_1221,call_function,convert_element_type.default,backward,16,1,1,1,1769,1779,4
+4904,convert_element_type_1222,call_function,convert_element_type.default,backward,16,1,1,1,3,1773,2
+4905,alias_default_986,call_function,alias.default,backward,16,1,1,2,4390,1778,4
+4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8
+4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8
+4908,alias_default_987,call_function,alias.default,backward,16,1,1,2,4393,1771,4
+4909,alias_default_988,call_function,alias.default,backward,16,1,1,3,1778,1777,4
+4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8
+4911,sum_49,call_function,sum.dim_IntList,backward,16,1,1,1,4398,1769,5
+4912,div_52,call_function,div.Tensor,backward,16,1,1,1,1779,1769,6
+4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8
+4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10
+4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8
+4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8
+4917,sum_50,call_function,sum.dim_IntList,backward,16,1,1,1,4395,3,5
+4918,convert_element_type_1223,call_function,convert_element_type.default,backward,16,1,1,1,4403,1765,6
+4919,convert_element_type_1224,call_function,convert_element_type.default,backward,16,1,1,1,4396,2,3
+4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10
+4921,dtype_cast_364,call_function,dtype_cast.default,backward,16,1,1,1,4397,1,3
+4922,alias_default_1398,call_function,alias.default,backward,16,1,1,0,4398,0,2
+4923,alias_default_989,call_function,alias.default,unknown,,1,1,3,4405,1763,4
+4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5
+4925,permute_699,call_function,permute.default,backward,15,1,1,1,4,1759,3
+4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5
+4927,permute_700,call_function,permute.default,backward,15,1,1,1,4407,2,4
+4928,dtype_cast_365,call_function,dtype_cast.default,backward,15,1,1,1,4408,1,4
+4929,alias_default_1387,call_function,alias.default,backward,15,1,1,0,4409,0,3
+4930,alias_default_990,call_function,alias.default,backward,15,1,1,2,4408,1757,4
+4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8
+4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8
+4933,alias_default_991,call_function,alias.default,backward,15,1,1,2,4410,1744,4
+4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5
+4935,permute_703,call_function,permute.default,backward,15,1,1,1,4,1740,3
+4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5
+4937,permute_704,call_function,permute.default,backward,15,1,1,1,4412,2,4
+4938,dtype_cast_366,call_function,dtype_cast.default,backward,15,1,1,1,4413,1,4
+4939,alias_default_1388,call_function,alias.default,backward,15,1,1,0,4414,0,3
+4940,convert_element_type_1233,call_function,convert_element_type.default,backward,15,1,1,1,4410,1748,6
+4941,convert_element_type_1234,call_function,convert_element_type.default,backward,15,1,1,1,1746,1758,4
+4942,alias_default_992,call_function,alias.default,backward,15,1,1,2,1747,1757,4
+4943,neg_40,call_function,neg.default,backward,15,1,1,1,1748,1756,8
+4944,exp_40,call_function,exp.default,backward,15,1,1,1,1749,1755,6
+4945,add_225,call_function,add.Tensor,backward,15,1,1,1,1750,1754,4
+4946,reciprocal_12,call_function,reciprocal.default,backward,15,1,1,1,1751,1753,4
+4947,mul_446,call_function,mul.Tensor,backward,15,1,1,1,1752,1752,6
+4948,alias_default_993,call_function,alias.default,backward,15,1,1,2,1753,1751,4
+4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8
+4950,sub_37,call_function,sub.Tensor,backward,15,1,1,1,1754,1749,4
+4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8
+4952,add_226,call_function,add.Tensor,backward,15,1,1,1,1756,1747,4
+4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8
+4954,convert_element_type_1235,call_function,convert_element_type.default,backward,15,1,1,1,4424,1745,6
+4955,alias_default_994,call_function,alias.default,backward,15,1,1,2,4425,1744,4
+4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5
+4957,permute_707,call_function,permute.default,backward,15,1,1,1,4,1740,3
+4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5
+4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10
+4960,permute_708,call_function,permute.default,backward,15,1,1,1,4427,2,4
+4961,dtype_cast_367,call_function,dtype_cast.default,backward,15,1,1,1,4428,1,4
+4962,alias_default_1386,call_function,alias.default,backward,15,1,1,0,4429,0,3
+4963,convert_element_type_1240,call_function,convert_element_type.default,backward,15,1,1,1,4433,1737,8
+4964,convert_element_type_1241,call_function,convert_element_type.default,backward,15,1,1,1,1726,1737,4
+4965,convert_element_type_1242,call_function,convert_element_type.default,backward,15,1,1,1,3,1731,2
+4966,alias_default_995,call_function,alias.default,backward,15,1,1,2,4434,1736,4
+4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8
+4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8
+4969,alias_default_996,call_function,alias.default,backward,15,1,1,2,4437,1729,4
+4970,alias_default_997,call_function,alias.default,backward,15,1,1,3,1735,1735,4
+4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8
+4972,sum_51,call_function,sum.dim_IntList,backward,15,1,1,1,4442,1727,5
+4973,div_53,call_function,div.Tensor,backward,15,1,1,1,1736,1727,6
+4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8
+4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10
+4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8
+4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8
+4978,sum_52,call_function,sum.dim_IntList,backward,15,1,1,1,4439,3,5
+4979,convert_element_type_1243,call_function,convert_element_type.default,backward,15,1,1,1,4447,1723,6
+4980,convert_element_type_1244,call_function,convert_element_type.default,backward,15,1,1,1,4440,2,3
+4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10
+4982,dtype_cast_368,call_function,dtype_cast.default,backward,15,1,1,1,4441,1,3
+4983,alias_default_1390,call_function,alias.default,backward,15,1,1,0,4442,0,2
+4984,alias_default_998,call_function,alias.default,unknown,,1,1,3,4449,1721,4
+4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5
+4986,permute_711,call_function,permute.default,backward,15,1,1,1,4,1717,3
+4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5
+4988,permute_712,call_function,permute.default,backward,15,1,1,1,4451,2,4
+4989,dtype_cast_369,call_function,dtype_cast.default,backward,15,1,1,1,4452,1,4
+4990,alias_default_1385,call_function,alias.default,backward,15,1,1,0,4453,0,3
+4991,view_920,call_function,view.default,backward,15,1,1,1,4452,1715,4
+4992,permute_713,call_function,permute.default,backward,15,1,1,1,4453,1714,4
+4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2
+4994,getitem_288,call_function,getitem,backward,15,1,1,1,4458,1686,2
+4995,getitem_289,call_function,getitem,backward,15,1,1,1,4458,1687,2
+4996,getitem_290,call_function,getitem,backward,15,1,1,1,4458,1680,2
+4997,permute_714,call_function,permute.default,backward,15,1,1,1,4459,1679,2
+4998,permute_715,call_function,permute.default,backward,15,1,1,1,4459,1686,2
+4999,permute_716,call_function,permute.default,backward,15,1,1,1,4459,1685,2
+5000,convert_element_type_1249,call_function,convert_element_type.default,backward,15,1,1,1,4460,1685,2
+5001,convert_element_type_1250,call_function,convert_element_type.default,backward,15,1,1,1,4460,1684,2
+5002,view_921,call_function,view.default,backward,15,1,1,1,4461,1684,2
+5003,view_as_complex_80,call_function,view_as_complex.default,backward,15,1,1,1,4462,1683,6
+5004,_conj_24,call_function,_conj.default,backward,15,1,1,1,4,1684,3
+5005,clone_102,call_function,clone.default,backward,15,1,1,1,5,1683,3
+5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8
+5007,view_922,call_function,view.default,backward,15,1,1,1,4461,1683,2
+5008,view_as_complex_81,call_function,view_as_complex.default,backward,15,1,1,1,4462,1682,6
+5009,_conj_25,call_function,_conj.default,backward,15,1,1,1,4,1683,3
+5010,clone_103,call_function,clone.default,backward,15,1,1,1,5,1682,3
+5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8
+5012,view_as_real_80,call_function,view_as_real.default,backward,15,1,1,1,4466,1681,6
+5013,view_923,call_function,view.default,backward,15,1,1,1,4467,1680,6
+5014,convert_element_type_1251,call_function,convert_element_type.default,backward,15,1,1,1,4468,1679,6
+5015,view_as_real_81,call_function,view_as_real.default,backward,15,1,1,1,4466,1680,6
+5016,view_924,call_function,view.default,backward,15,1,1,1,4467,1679,6
+5017,convert_element_type_1252,call_function,convert_element_type.default,backward,15,1,1,1,4468,1678,6
+5018,view_925,call_function,view.default,backward,15,1,1,1,4460,1678,2
+5019,view_926,call_function,view.default,backward,15,1,1,1,4469,1678,5
+5020,view_927,call_function,view.default,backward,15,1,1,1,4469,1677,5
+5021,alias_default_999,call_function,alias.default,backward,15,1,1,2,4461,1677,4
+5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5
+5023,permute_719,call_function,permute.default,backward,15,1,1,1,4,1673,3
+5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5
+5025,permute_720,call_function,permute.default,backward,15,1,1,1,4463,2,4
+5026,dtype_cast_370,call_function,dtype_cast.default,backward,15,1,1,1,4464,1,4
+5027,alias_default_1384,call_function,alias.default,backward,15,1,1,0,4465,0,3
+5028,alias_default_1000,call_function,alias.default,backward,15,1,1,2,4470,1677,4
+5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5
+5030,permute_723,call_function,permute.default,backward,15,1,1,1,4,1673,3
+5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5
+5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10
+5033,permute_724,call_function,permute.default,backward,15,1,1,1,4472,2,4
+5034,dtype_cast_371,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4
+5035,alias_default_1383,call_function,alias.default,backward,15,1,1,0,4474,0,3
+5036,alias_default_1001,call_function,alias.default,backward,15,1,1,2,4470,1676,4
+5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5
+5038,permute_727,call_function,permute.default,backward,15,1,1,1,4,1672,3
+5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5
+5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10
+5041,permute_728,call_function,permute.default,backward,15,1,1,1,4472,2,4
+5042,dtype_cast_372,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4
+5043,alias_default_1382,call_function,alias.default,backward,15,1,1,0,4474,0,3
+5044,convert_element_type_1265,call_function,convert_element_type.default,backward,15,1,1,1,4496,1669,8
+5045,convert_element_type_1266,call_function,convert_element_type.default,backward,15,1,1,1,1659,1669,4
+5046,convert_element_type_1267,call_function,convert_element_type.default,backward,15,1,1,1,3,1663,2
+5047,alias_default_1002,call_function,alias.default,backward,15,1,1,2,4497,1668,4
+5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8
+5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8
+5050,alias_default_1003,call_function,alias.default,backward,15,1,1,2,4500,1661,4
+5051,alias_default_1004,call_function,alias.default,backward,15,1,1,3,1668,1667,4
+5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8
+5053,sum_53,call_function,sum.dim_IntList,backward,15,1,1,1,4505,1659,5
+5054,div_54,call_function,div.Tensor,backward,15,1,1,1,1669,1659,6
+5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8
+5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10
+5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8
+5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8
+5059,sum_54,call_function,sum.dim_IntList,backward,15,1,1,1,4502,3,5
+5060,convert_element_type_1268,call_function,convert_element_type.default,backward,15,1,1,1,4510,1655,6
+5061,convert_element_type_1269,call_function,convert_element_type.default,backward,15,1,1,1,4503,2,3
+5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10
+5063,dtype_cast_373,call_function,dtype_cast.default,backward,15,1,1,1,4504,1,3
+5064,alias_default_1389,call_function,alias.default,backward,15,1,1,0,4505,0,2
+5065,alias_default_1005,call_function,alias.default,unknown,,1,1,3,4512,1653,4
+5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5
+5067,permute_731,call_function,permute.default,backward,14,1,1,1,4,1649,3
+5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5
+5069,permute_732,call_function,permute.default,backward,14,1,1,1,4514,2,4
+5070,dtype_cast_374,call_function,dtype_cast.default,backward,14,1,1,1,4515,1,4
+5071,alias_default_1378,call_function,alias.default,backward,14,1,1,0,4516,0,3
+5072,alias_default_1006,call_function,alias.default,backward,14,1,1,2,4515,1647,4
+5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8
+5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8
+5075,alias_default_1007,call_function,alias.default,backward,14,1,1,2,4517,1634,4
+5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5
+5077,permute_735,call_function,permute.default,backward,14,1,1,1,4,1630,3
+5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5
+5079,permute_736,call_function,permute.default,backward,14,1,1,1,4519,2,4
+5080,dtype_cast_375,call_function,dtype_cast.default,backward,14,1,1,1,4520,1,4
+5081,alias_default_1379,call_function,alias.default,backward,14,1,1,0,4521,0,3
+5082,convert_element_type_1278,call_function,convert_element_type.default,backward,14,1,1,1,4517,1638,6
+5083,convert_element_type_1279,call_function,convert_element_type.default,backward,14,1,1,1,1636,1648,4
+5084,alias_default_1008,call_function,alias.default,backward,14,1,1,2,1637,1647,4
+5085,neg_41,call_function,neg.default,backward,14,1,1,1,1638,1646,8
+5086,exp_41,call_function,exp.default,backward,14,1,1,1,1639,1645,6
+5087,add_232,call_function,add.Tensor,backward,14,1,1,1,1640,1644,4
+5088,reciprocal_13,call_function,reciprocal.default,backward,14,1,1,1,1641,1643,4
+5089,mul_466,call_function,mul.Tensor,backward,14,1,1,1,1642,1642,6
+5090,alias_default_1009,call_function,alias.default,backward,14,1,1,2,1643,1641,4
+5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8
+5092,sub_40,call_function,sub.Tensor,backward,14,1,1,1,1644,1639,4
+5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8
+5094,add_233,call_function,add.Tensor,backward,14,1,1,1,1646,1637,4
+5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8
+5096,convert_element_type_1280,call_function,convert_element_type.default,backward,14,1,1,1,4531,1635,6
+5097,alias_default_1010,call_function,alias.default,backward,14,1,1,2,4532,1634,4
+5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5
+5099,permute_739,call_function,permute.default,backward,14,1,1,1,4,1630,3
+5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5
+5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10
+5102,permute_740,call_function,permute.default,backward,14,1,1,1,4534,2,4
+5103,dtype_cast_376,call_function,dtype_cast.default,backward,14,1,1,1,4535,1,4
+5104,alias_default_1377,call_function,alias.default,backward,14,1,1,0,4536,0,3
+5105,convert_element_type_1285,call_function,convert_element_type.default,backward,14,1,1,1,4540,1627,8
+5106,convert_element_type_1286,call_function,convert_element_type.default,backward,14,1,1,1,1616,1627,4
+5107,convert_element_type_1287,call_function,convert_element_type.default,backward,14,1,1,1,3,1621,2
+5108,alias_default_1011,call_function,alias.default,backward,14,1,1,2,4541,1626,4
+5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8
+5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8
+5111,alias_default_1012,call_function,alias.default,backward,14,1,1,2,4544,1619,4
+5112,alias_default_1013,call_function,alias.default,backward,14,1,1,3,1625,1625,4
+5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8
+5114,sum_55,call_function,sum.dim_IntList,backward,14,1,1,1,4549,1617,5
+5115,div_55,call_function,div.Tensor,backward,14,1,1,1,1626,1617,6
+5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8
+5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10
+5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8
+5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8
+5120,sum_56,call_function,sum.dim_IntList,backward,14,1,1,1,4546,3,5
+5121,convert_element_type_1288,call_function,convert_element_type.default,backward,14,1,1,1,4554,1613,6
+5122,convert_element_type_1289,call_function,convert_element_type.default,backward,14,1,1,1,4547,2,3
+5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10
+5124,dtype_cast_377,call_function,dtype_cast.default,backward,14,1,1,1,4548,1,3
+5125,alias_default_1381,call_function,alias.default,backward,14,1,1,0,4549,0,2
+5126,alias_default_1014,call_function,alias.default,unknown,,1,1,3,4556,1611,4
+5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5
+5128,permute_743,call_function,permute.default,backward,14,1,1,1,4,1607,3
+5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5
+5130,permute_744,call_function,permute.default,backward,14,1,1,1,4558,2,4
+5131,dtype_cast_378,call_function,dtype_cast.default,backward,14,1,1,1,4559,1,4
+5132,alias_default_1376,call_function,alias.default,backward,14,1,1,0,4560,0,3
+5133,view_942,call_function,view.default,backward,14,1,1,1,4559,1605,4
+5134,permute_745,call_function,permute.default,backward,14,1,1,1,4560,1604,4
+5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2
+5136,getitem_291,call_function,getitem,backward,14,1,1,1,4565,1576,2
+5137,getitem_292,call_function,getitem,backward,14,1,1,1,4565,1577,2
+5138,getitem_293,call_function,getitem,backward,14,1,1,1,4565,1570,2
+5139,permute_746,call_function,permute.default,backward,14,1,1,1,4566,1569,2
+5140,permute_747,call_function,permute.default,backward,14,1,1,1,4566,1576,2
+5141,permute_748,call_function,permute.default,backward,14,1,1,1,4566,1575,2
+5142,convert_element_type_1294,call_function,convert_element_type.default,backward,14,1,1,1,4567,1575,2
+5143,convert_element_type_1295,call_function,convert_element_type.default,backward,14,1,1,1,4567,1574,2
+5144,view_943,call_function,view.default,backward,14,1,1,1,4568,1574,2
+5145,view_as_complex_82,call_function,view_as_complex.default,backward,14,1,1,1,4569,1573,6
+5146,_conj_26,call_function,_conj.default,backward,14,1,1,1,4,1574,3
+5147,clone_110,call_function,clone.default,backward,14,1,1,1,5,1573,3
+5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8
+5149,view_944,call_function,view.default,backward,14,1,1,1,4568,1573,2
+5150,view_as_complex_83,call_function,view_as_complex.default,backward,14,1,1,1,4569,1572,6
+5151,_conj_27,call_function,_conj.default,backward,14,1,1,1,4,1573,3
+5152,clone_111,call_function,clone.default,backward,14,1,1,1,5,1572,3
+5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8
+5154,view_as_real_82,call_function,view_as_real.default,backward,14,1,1,1,4573,1571,6
+5155,view_945,call_function,view.default,backward,14,1,1,1,4574,1570,6
+5156,convert_element_type_1296,call_function,convert_element_type.default,backward,14,1,1,1,4575,1569,6
+5157,view_as_real_83,call_function,view_as_real.default,backward,14,1,1,1,4573,1570,6
+5158,view_946,call_function,view.default,backward,14,1,1,1,4574,1569,6
+5159,convert_element_type_1297,call_function,convert_element_type.default,backward,14,1,1,1,4575,1568,6
+5160,view_947,call_function,view.default,backward,14,1,1,1,4567,1568,2
+5161,view_948,call_function,view.default,backward,14,1,1,1,4576,1568,5
+5162,view_949,call_function,view.default,backward,14,1,1,1,4576,1567,5
+5163,alias_default_1015,call_function,alias.default,backward,14,1,1,2,4568,1567,4
+5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5
+5165,permute_751,call_function,permute.default,backward,14,1,1,1,4,1563,3
+5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5
+5167,permute_752,call_function,permute.default,backward,14,1,1,1,4570,2,4
+5168,dtype_cast_379,call_function,dtype_cast.default,backward,14,1,1,1,4571,1,4
+5169,alias_default_1375,call_function,alias.default,backward,14,1,1,0,4572,0,3
+5170,alias_default_1016,call_function,alias.default,backward,14,1,1,2,4577,1567,4
+5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5
+5172,permute_755,call_function,permute.default,backward,14,1,1,1,4,1563,3
+5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5
+5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10
+5175,permute_756,call_function,permute.default,backward,14,1,1,1,4579,2,4
+5176,dtype_cast_380,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4
+5177,alias_default_1374,call_function,alias.default,backward,14,1,1,0,4581,0,3
+5178,alias_default_1017,call_function,alias.default,backward,14,1,1,2,4577,1566,4
+5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5
+5180,permute_759,call_function,permute.default,backward,14,1,1,1,4,1562,3
+5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5
+5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10
+5183,permute_760,call_function,permute.default,backward,14,1,1,1,4579,2,4
+5184,dtype_cast_381,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4
+5185,alias_default_1373,call_function,alias.default,backward,14,1,1,0,4581,0,3
+5186,convert_element_type_1310,call_function,convert_element_type.default,backward,14,1,1,1,4603,1559,8
+5187,convert_element_type_1311,call_function,convert_element_type.default,backward,14,1,1,1,1549,1559,4
+5188,convert_element_type_1312,call_function,convert_element_type.default,backward,14,1,1,1,3,1553,2
+5189,alias_default_1018,call_function,alias.default,backward,14,1,1,2,4604,1558,4
+5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8
+5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8
+5192,alias_default_1019,call_function,alias.default,backward,14,1,1,2,4607,1551,4
+5193,alias_default_1020,call_function,alias.default,backward,14,1,1,3,1558,1557,4
+5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8
+5195,sum_57,call_function,sum.dim_IntList,backward,14,1,1,1,4612,1549,5
+5196,div_56,call_function,div.Tensor,backward,14,1,1,1,1559,1549,6
+5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8
+5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10
+5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8
+5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8
+5201,sum_58,call_function,sum.dim_IntList,backward,14,1,1,1,4609,3,5
+5202,convert_element_type_1313,call_function,convert_element_type.default,backward,14,1,1,1,4617,1545,6
+5203,convert_element_type_1314,call_function,convert_element_type.default,backward,14,1,1,1,4610,2,3
+5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10
+5205,dtype_cast_382,call_function,dtype_cast.default,backward,14,1,1,1,4611,1,3
+5206,alias_default_1380,call_function,alias.default,backward,14,1,1,0,4612,0,2
+5207,alias_default_1021,call_function,alias.default,unknown,,1,1,3,4619,1543,4
+5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5
+5209,permute_763,call_function,permute.default,backward,13,1,1,1,4,1539,3
+5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5
+5211,permute_764,call_function,permute.default,backward,13,1,1,1,4621,2,4
+5212,dtype_cast_383,call_function,dtype_cast.default,backward,13,1,1,1,4622,1,4
+5213,alias_default_1369,call_function,alias.default,backward,13,1,1,0,4623,0,3
+5214,alias_default_1022,call_function,alias.default,backward,13,1,1,2,4622,1537,4
+5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8
+5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8
+5217,alias_default_1023,call_function,alias.default,backward,13,1,1,2,4624,1524,4
+5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5
+5219,permute_767,call_function,permute.default,backward,13,1,1,1,4,1520,3
+5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5
+5221,permute_768,call_function,permute.default,backward,13,1,1,1,4626,2,4
+5222,dtype_cast_384,call_function,dtype_cast.default,backward,13,1,1,1,4627,1,4
+5223,alias_default_1370,call_function,alias.default,backward,13,1,1,0,4628,0,3
+5224,convert_element_type_1323,call_function,convert_element_type.default,backward,13,1,1,1,4624,1528,6
+5225,convert_element_type_1324,call_function,convert_element_type.default,backward,13,1,1,1,1526,1538,4
+5226,alias_default_1024,call_function,alias.default,backward,13,1,1,2,1527,1537,4
+5227,neg_42,call_function,neg.default,backward,13,1,1,1,1528,1536,8
+5228,exp_42,call_function,exp.default,backward,13,1,1,1,1529,1535,6
+5229,add_239,call_function,add.Tensor,backward,13,1,1,1,1530,1534,4
+5230,reciprocal_14,call_function,reciprocal.default,backward,13,1,1,1,1531,1533,4
+5231,mul_486,call_function,mul.Tensor,backward,13,1,1,1,1532,1532,6
+5232,alias_default_1025,call_function,alias.default,backward,13,1,1,2,1533,1531,4
+5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8
+5234,sub_43,call_function,sub.Tensor,backward,13,1,1,1,1534,1529,4
+5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8
+5236,add_240,call_function,add.Tensor,backward,13,1,1,1,1536,1527,4
+5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8
+5238,convert_element_type_1325,call_function,convert_element_type.default,backward,13,1,1,1,4638,1525,6
+5239,alias_default_1026,call_function,alias.default,backward,13,1,1,2,4639,1524,4
+5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5
+5241,permute_771,call_function,permute.default,backward,13,1,1,1,4,1520,3
+5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5
+5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10
+5244,permute_772,call_function,permute.default,backward,13,1,1,1,4641,2,4
+5245,dtype_cast_385,call_function,dtype_cast.default,backward,13,1,1,1,4642,1,4
+5246,alias_default_1368,call_function,alias.default,backward,13,1,1,0,4643,0,3
+5247,convert_element_type_1330,call_function,convert_element_type.default,backward,13,1,1,1,4647,1517,8
+5248,convert_element_type_1331,call_function,convert_element_type.default,backward,13,1,1,1,1506,1517,4
+5249,convert_element_type_1332,call_function,convert_element_type.default,backward,13,1,1,1,3,1511,2
+5250,alias_default_1027,call_function,alias.default,backward,13,1,1,2,4648,1516,4
+5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8
+5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8
+5253,alias_default_1028,call_function,alias.default,backward,13,1,1,2,4651,1509,4
+5254,alias_default_1029,call_function,alias.default,backward,13,1,1,3,1515,1515,4
+5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8
+5256,sum_59,call_function,sum.dim_IntList,backward,13,1,1,1,4656,1507,5
+5257,div_57,call_function,div.Tensor,backward,13,1,1,1,1516,1507,6
+5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8
+5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10
+5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8
+5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8
+5262,sum_60,call_function,sum.dim_IntList,backward,13,1,1,1,4653,3,5
+5263,convert_element_type_1333,call_function,convert_element_type.default,backward,13,1,1,1,4661,1503,6
+5264,convert_element_type_1334,call_function,convert_element_type.default,backward,13,1,1,1,4654,2,3
+5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10
+5266,dtype_cast_386,call_function,dtype_cast.default,backward,13,1,1,1,4655,1,3
+5267,alias_default_1372,call_function,alias.default,backward,13,1,1,0,4656,0,2
+5268,alias_default_1030,call_function,alias.default,unknown,,1,1,3,4663,1501,4
+5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5
+5270,permute_775,call_function,permute.default,backward,13,1,1,1,4,1497,3
+5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5
+5272,permute_776,call_function,permute.default,backward,13,1,1,1,4665,2,4
+5273,dtype_cast_387,call_function,dtype_cast.default,backward,13,1,1,1,4666,1,4
+5274,alias_default_1367,call_function,alias.default,backward,13,1,1,0,4667,0,3
+5275,view_964,call_function,view.default,backward,13,1,1,1,4666,1495,4
+5276,permute_777,call_function,permute.default,backward,13,1,1,1,4667,1494,4
+5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2
+5278,getitem_294,call_function,getitem,backward,13,1,1,1,4672,1466,2
+5279,getitem_295,call_function,getitem,backward,13,1,1,1,4672,1467,2
+5280,getitem_296,call_function,getitem,backward,13,1,1,1,4672,1460,2
+5281,permute_778,call_function,permute.default,backward,13,1,1,1,4673,1459,2
+5282,permute_779,call_function,permute.default,backward,13,1,1,1,4673,1466,2
+5283,permute_780,call_function,permute.default,backward,13,1,1,1,4673,1465,2
+5284,convert_element_type_1339,call_function,convert_element_type.default,backward,13,1,1,1,4674,1465,2
+5285,convert_element_type_1340,call_function,convert_element_type.default,backward,13,1,1,1,4674,1464,2
+5286,view_965,call_function,view.default,backward,13,1,1,1,4675,1464,2
+5287,view_as_complex_84,call_function,view_as_complex.default,backward,13,1,1,1,4676,1463,6
+5288,_conj_28,call_function,_conj.default,backward,13,1,1,1,4,1464,3
+5289,clone_118,call_function,clone.default,backward,13,1,1,1,5,1463,3
+5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8
+5291,view_966,call_function,view.default,backward,13,1,1,1,4675,1463,2
+5292,view_as_complex_85,call_function,view_as_complex.default,backward,13,1,1,1,4676,1462,6
+5293,_conj_29,call_function,_conj.default,backward,13,1,1,1,4,1463,3
+5294,clone_119,call_function,clone.default,backward,13,1,1,1,5,1462,3
+5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8
+5296,view_as_real_84,call_function,view_as_real.default,backward,13,1,1,1,4680,1461,6
+5297,view_967,call_function,view.default,backward,13,1,1,1,4681,1460,6
+5298,convert_element_type_1341,call_function,convert_element_type.default,backward,13,1,1,1,4682,1459,6
+5299,view_as_real_85,call_function,view_as_real.default,backward,13,1,1,1,4680,1460,6
+5300,view_968,call_function,view.default,backward,13,1,1,1,4681,1459,6
+5301,convert_element_type_1342,call_function,convert_element_type.default,backward,13,1,1,1,4682,1458,6
+5302,view_969,call_function,view.default,backward,13,1,1,1,4674,1458,2
+5303,view_970,call_function,view.default,backward,13,1,1,1,4683,1458,5
+5304,view_971,call_function,view.default,backward,13,1,1,1,4683,1457,5
+5305,alias_default_1031,call_function,alias.default,backward,13,1,1,2,4675,1457,4
+5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5
+5307,permute_783,call_function,permute.default,backward,13,1,1,1,4,1453,3
+5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5
+5309,permute_784,call_function,permute.default,backward,13,1,1,1,4677,2,4
+5310,dtype_cast_388,call_function,dtype_cast.default,backward,13,1,1,1,4678,1,4
+5311,alias_default_1366,call_function,alias.default,backward,13,1,1,0,4679,0,3
+5312,alias_default_1032,call_function,alias.default,backward,13,1,1,2,4684,1457,4
+5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5
+5314,permute_787,call_function,permute.default,backward,13,1,1,1,4,1453,3
+5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5
+5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10
+5317,permute_788,call_function,permute.default,backward,13,1,1,1,4686,2,4
+5318,dtype_cast_389,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4
+5319,alias_default_1365,call_function,alias.default,backward,13,1,1,0,4688,0,3
+5320,alias_default_1033,call_function,alias.default,backward,13,1,1,2,4684,1456,4
+5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5
+5322,permute_791,call_function,permute.default,backward,13,1,1,1,4,1452,3
+5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5
+5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10
+5325,permute_792,call_function,permute.default,backward,13,1,1,1,4686,2,4
+5326,dtype_cast_390,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4
+5327,alias_default_1364,call_function,alias.default,backward,13,1,1,0,4688,0,3
+5328,convert_element_type_1355,call_function,convert_element_type.default,backward,13,1,1,1,4710,1449,8
+5329,convert_element_type_1356,call_function,convert_element_type.default,backward,13,1,1,1,1439,1449,4
+5330,convert_element_type_1357,call_function,convert_element_type.default,backward,13,1,1,1,3,1443,2
+5331,alias_default_1034,call_function,alias.default,backward,13,1,1,2,4711,1448,4
+5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8
+5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8
+5334,alias_default_1035,call_function,alias.default,backward,13,1,1,2,4714,1441,4
+5335,alias_default_1036,call_function,alias.default,backward,13,1,1,3,1448,1447,4
+5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8
+5337,sum_61,call_function,sum.dim_IntList,backward,13,1,1,1,4719,1439,5
+5338,div_58,call_function,div.Tensor,backward,13,1,1,1,1449,1439,6
+5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8
+5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10
+5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8
+5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8
+5343,sum_62,call_function,sum.dim_IntList,backward,13,1,1,1,4716,3,5
+5344,convert_element_type_1358,call_function,convert_element_type.default,backward,13,1,1,1,4724,1435,6
+5345,convert_element_type_1359,call_function,convert_element_type.default,backward,13,1,1,1,4717,2,3
+5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10
+5347,dtype_cast_391,call_function,dtype_cast.default,backward,13,1,1,1,4718,1,3
+5348,alias_default_1371,call_function,alias.default,backward,13,1,1,0,4719,0,2
+5349,alias_default_1037,call_function,alias.default,unknown,,1,1,3,4726,1433,4
+5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5
+5351,permute_795,call_function,permute.default,backward,12,1,1,1,4,1429,3
+5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5
+5353,permute_796,call_function,permute.default,backward,12,1,1,1,4728,2,4
+5354,dtype_cast_392,call_function,dtype_cast.default,backward,12,1,1,1,4729,1,4
+5355,alias_default_1360,call_function,alias.default,backward,12,1,1,0,4730,0,3
+5356,alias_default_1038,call_function,alias.default,backward,12,1,1,2,4729,1427,4
+5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8
+5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8
+5359,alias_default_1039,call_function,alias.default,backward,12,1,1,2,4731,1414,4
+5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5
+5361,permute_799,call_function,permute.default,backward,12,1,1,1,4,1410,3
+5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5
+5363,permute_800,call_function,permute.default,backward,12,1,1,1,4733,2,4
+5364,dtype_cast_393,call_function,dtype_cast.default,backward,12,1,1,1,4734,1,4
+5365,alias_default_1361,call_function,alias.default,backward,12,1,1,0,4735,0,3
+5366,convert_element_type_1368,call_function,convert_element_type.default,backward,12,1,1,1,4731,1418,6
+5367,convert_element_type_1369,call_function,convert_element_type.default,backward,12,1,1,1,1416,1428,4
+5368,alias_default_1040,call_function,alias.default,backward,12,1,1,2,1417,1427,4
+5369,neg_43,call_function,neg.default,backward,12,1,1,1,1418,1426,8
+5370,exp_43,call_function,exp.default,backward,12,1,1,1,1419,1425,6
+5371,add_246,call_function,add.Tensor,backward,12,1,1,1,1420,1424,4
+5372,reciprocal_15,call_function,reciprocal.default,backward,12,1,1,1,1421,1423,4
+5373,mul_506,call_function,mul.Tensor,backward,12,1,1,1,1422,1422,6
+5374,alias_default_1041,call_function,alias.default,backward,12,1,1,2,1423,1421,4
+5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8
+5376,sub_46,call_function,sub.Tensor,backward,12,1,1,1,1424,1419,4
+5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8
+5378,add_247,call_function,add.Tensor,backward,12,1,1,1,1426,1417,4
+5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8
+5380,convert_element_type_1370,call_function,convert_element_type.default,backward,12,1,1,1,4745,1415,6
+5381,alias_default_1042,call_function,alias.default,backward,12,1,1,2,4746,1414,4
+5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5
+5383,permute_803,call_function,permute.default,backward,12,1,1,1,4,1410,3
+5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5
+5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10
+5386,permute_804,call_function,permute.default,backward,12,1,1,1,4748,2,4
+5387,dtype_cast_394,call_function,dtype_cast.default,backward,12,1,1,1,4749,1,4
+5388,alias_default_1359,call_function,alias.default,backward,12,1,1,0,4750,0,3
+5389,convert_element_type_1375,call_function,convert_element_type.default,backward,12,1,1,1,4754,1407,8
+5390,convert_element_type_1376,call_function,convert_element_type.default,backward,12,1,1,1,1396,1407,4
+5391,convert_element_type_1377,call_function,convert_element_type.default,backward,12,1,1,1,3,1401,2
+5392,alias_default_1043,call_function,alias.default,backward,12,1,1,2,4755,1406,4
+5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8
+5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8
+5395,alias_default_1044,call_function,alias.default,backward,12,1,1,2,4758,1399,4
+5396,alias_default_1045,call_function,alias.default,backward,12,1,1,3,1405,1405,4
+5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8
+5398,sum_63,call_function,sum.dim_IntList,backward,12,1,1,1,4763,1397,5
+5399,div_59,call_function,div.Tensor,backward,12,1,1,1,1406,1397,6
+5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8
+5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10
+5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8
+5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8
+5404,sum_64,call_function,sum.dim_IntList,backward,12,1,1,1,4760,3,5
+5405,convert_element_type_1378,call_function,convert_element_type.default,backward,12,1,1,1,4768,1393,6
+5406,convert_element_type_1379,call_function,convert_element_type.default,backward,12,1,1,1,4761,2,3
+5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10
+5408,dtype_cast_395,call_function,dtype_cast.default,backward,12,1,1,1,4762,1,3
+5409,alias_default_1363,call_function,alias.default,backward,12,1,1,0,4763,0,2
+5410,alias_default_1046,call_function,alias.default,unknown,,1,1,3,4770,1391,4
+5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5
+5412,permute_807,call_function,permute.default,backward,12,1,1,1,4,1387,3
+5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5
+5414,permute_808,call_function,permute.default,backward,12,1,1,1,4772,2,4
+5415,dtype_cast_396,call_function,dtype_cast.default,backward,12,1,1,1,4773,1,4
+5416,alias_default_1358,call_function,alias.default,backward,12,1,1,0,4774,0,3
+5417,view_986,call_function,view.default,backward,12,1,1,1,4773,1385,4
+5418,permute_809,call_function,permute.default,backward,12,1,1,1,4774,1384,4
+5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2
+5420,getitem_297,call_function,getitem,backward,12,1,1,1,4779,1356,2
+5421,getitem_298,call_function,getitem,backward,12,1,1,1,4779,1357,2
+5422,getitem_299,call_function,getitem,backward,12,1,1,1,4779,1350,2
+5423,permute_810,call_function,permute.default,backward,12,1,1,1,4780,1349,2
+5424,permute_811,call_function,permute.default,backward,12,1,1,1,4780,1356,2
+5425,permute_812,call_function,permute.default,backward,12,1,1,1,4780,1355,2
+5426,convert_element_type_1384,call_function,convert_element_type.default,backward,12,1,1,1,4781,1355,2
+5427,convert_element_type_1385,call_function,convert_element_type.default,backward,12,1,1,1,4781,1354,2
+5428,view_987,call_function,view.default,backward,12,1,1,1,4782,1354,2
+5429,view_as_complex_86,call_function,view_as_complex.default,backward,12,1,1,1,4783,1353,6
+5430,_conj_30,call_function,_conj.default,backward,12,1,1,1,4,1354,3
+5431,clone_126,call_function,clone.default,backward,12,1,1,1,5,1353,3
+5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8
+5433,view_988,call_function,view.default,backward,12,1,1,1,4782,1353,2
+5434,view_as_complex_87,call_function,view_as_complex.default,backward,12,1,1,1,4783,1352,6
+5435,_conj_31,call_function,_conj.default,backward,12,1,1,1,4,1353,3
+5436,clone_127,call_function,clone.default,backward,12,1,1,1,5,1352,3
+5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8
+5438,view_as_real_86,call_function,view_as_real.default,backward,12,1,1,1,4787,1351,6
+5439,view_989,call_function,view.default,backward,12,1,1,1,4788,1350,6
+5440,convert_element_type_1386,call_function,convert_element_type.default,backward,12,1,1,1,4789,1349,6
+5441,view_as_real_87,call_function,view_as_real.default,backward,12,1,1,1,4787,1350,6
+5442,view_990,call_function,view.default,backward,12,1,1,1,4788,1349,6
+5443,convert_element_type_1387,call_function,convert_element_type.default,backward,12,1,1,1,4789,1348,6
+5444,view_991,call_function,view.default,backward,12,1,1,1,4781,1348,2
+5445,view_992,call_function,view.default,backward,12,1,1,1,4790,1348,5
+5446,view_993,call_function,view.default,backward,12,1,1,1,4790,1347,5
+5447,alias_default_1047,call_function,alias.default,backward,12,1,1,2,4782,1347,4
+5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5
+5449,permute_815,call_function,permute.default,backward,12,1,1,1,4,1343,3
+5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5
+5451,permute_816,call_function,permute.default,backward,12,1,1,1,4784,2,4
+5452,dtype_cast_397,call_function,dtype_cast.default,backward,12,1,1,1,4785,1,4
+5453,alias_default_1357,call_function,alias.default,backward,12,1,1,0,4786,0,3
+5454,alias_default_1048,call_function,alias.default,backward,12,1,1,2,4791,1347,4
+5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5
+5456,permute_819,call_function,permute.default,backward,12,1,1,1,4,1343,3
+5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5
+5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10
+5459,permute_820,call_function,permute.default,backward,12,1,1,1,4793,2,4
+5460,dtype_cast_398,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4
+5461,alias_default_1356,call_function,alias.default,backward,12,1,1,0,4795,0,3
+5462,alias_default_1049,call_function,alias.default,backward,12,1,1,2,4791,1346,4
+5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5
+5464,permute_823,call_function,permute.default,backward,12,1,1,1,4,1342,3
+5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5
+5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10
+5467,permute_824,call_function,permute.default,backward,12,1,1,1,4793,2,4
+5468,dtype_cast_399,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4
+5469,alias_default_1355,call_function,alias.default,backward,12,1,1,0,4795,0,3
+5470,convert_element_type_1400,call_function,convert_element_type.default,backward,12,1,1,1,4817,1339,8
+5471,convert_element_type_1401,call_function,convert_element_type.default,backward,12,1,1,1,1329,1339,4
+5472,convert_element_type_1402,call_function,convert_element_type.default,backward,12,1,1,1,3,1333,2
+5473,alias_default_1050,call_function,alias.default,backward,12,1,1,2,4818,1338,4
+5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8
+5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8
+5476,alias_default_1051,call_function,alias.default,backward,12,1,1,2,4821,1331,4
+5477,alias_default_1052,call_function,alias.default,backward,12,1,1,3,1338,1337,4
+5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8
+5479,sum_65,call_function,sum.dim_IntList,backward,12,1,1,1,4826,1329,5
+5480,div_60,call_function,div.Tensor,backward,12,1,1,1,1339,1329,6
+5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8
+5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10
+5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8
+5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8
+5485,sum_66,call_function,sum.dim_IntList,backward,12,1,1,1,4823,3,5
+5486,convert_element_type_1403,call_function,convert_element_type.default,backward,12,1,1,1,4831,1325,6
+5487,convert_element_type_1404,call_function,convert_element_type.default,backward,12,1,1,1,4824,2,3
+5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10
+5489,dtype_cast_400,call_function,dtype_cast.default,backward,12,1,1,1,4825,1,3
+5490,alias_default_1362,call_function,alias.default,backward,12,1,1,0,4826,0,2
+5491,alias_default_1053,call_function,alias.default,unknown,,1,1,3,4833,1323,4
+5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5
+5493,permute_827,call_function,permute.default,backward,11,1,1,1,4,1319,3
+5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5
+5495,permute_828,call_function,permute.default,backward,11,1,1,1,4835,2,4
+5496,dtype_cast_401,call_function,dtype_cast.default,backward,11,1,1,1,4836,1,4
+5497,alias_default_1351,call_function,alias.default,backward,11,1,1,0,4837,0,3
+5498,alias_default_1054,call_function,alias.default,backward,11,1,1,2,4836,1317,4
+5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8
+5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8
+5501,alias_default_1055,call_function,alias.default,backward,11,1,1,2,4838,1304,4
+5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5
+5503,permute_831,call_function,permute.default,backward,11,1,1,1,4,1300,3
+5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5
+5505,permute_832,call_function,permute.default,backward,11,1,1,1,4840,2,4
+5506,dtype_cast_402,call_function,dtype_cast.default,backward,11,1,1,1,4841,1,4
+5507,alias_default_1352,call_function,alias.default,backward,11,1,1,0,4842,0,3
+5508,convert_element_type_1413,call_function,convert_element_type.default,backward,11,1,1,1,4838,1308,6
+5509,convert_element_type_1414,call_function,convert_element_type.default,backward,11,1,1,1,1306,1318,4
+5510,alias_default_1056,call_function,alias.default,backward,11,1,1,2,1307,1317,4
+5511,neg_44,call_function,neg.default,backward,11,1,1,1,1308,1316,8
+5512,exp_44,call_function,exp.default,backward,11,1,1,1,1309,1315,6
+5513,add_253,call_function,add.Tensor,backward,11,1,1,1,1310,1314,4
+5514,reciprocal_16,call_function,reciprocal.default,backward,11,1,1,1,1311,1313,4
+5515,mul_526,call_function,mul.Tensor,backward,11,1,1,1,1312,1312,6
+5516,alias_default_1057,call_function,alias.default,backward,11,1,1,2,1313,1311,4
+5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8
+5518,sub_49,call_function,sub.Tensor,backward,11,1,1,1,1314,1309,4
+5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8
+5520,add_254,call_function,add.Tensor,backward,11,1,1,1,1316,1307,4
+5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8
+5522,convert_element_type_1415,call_function,convert_element_type.default,backward,11,1,1,1,4852,1305,6
+5523,alias_default_1058,call_function,alias.default,backward,11,1,1,2,4853,1304,4
+5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5
+5525,permute_835,call_function,permute.default,backward,11,1,1,1,4,1300,3
+5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5
+5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10
+5528,permute_836,call_function,permute.default,backward,11,1,1,1,4855,2,4
+5529,dtype_cast_403,call_function,dtype_cast.default,backward,11,1,1,1,4856,1,4
+5530,alias_default_1350,call_function,alias.default,backward,11,1,1,0,4857,0,3
+5531,convert_element_type_1420,call_function,convert_element_type.default,backward,11,1,1,1,4861,1297,8
+5532,convert_element_type_1421,call_function,convert_element_type.default,backward,11,1,1,1,1286,1297,4
+5533,convert_element_type_1422,call_function,convert_element_type.default,backward,11,1,1,1,3,1291,2
+5534,alias_default_1059,call_function,alias.default,backward,11,1,1,2,4862,1296,4
+5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8
+5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8
+5537,alias_default_1060,call_function,alias.default,backward,11,1,1,2,4865,1289,4
+5538,alias_default_1061,call_function,alias.default,backward,11,1,1,3,1295,1295,4
+5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8
+5540,sum_67,call_function,sum.dim_IntList,backward,11,1,1,1,4870,1287,5
+5541,div_61,call_function,div.Tensor,backward,11,1,1,1,1296,1287,6
+5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8
+5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10
+5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8
+5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8
+5546,sum_68,call_function,sum.dim_IntList,backward,11,1,1,1,4867,3,5
+5547,convert_element_type_1423,call_function,convert_element_type.default,backward,11,1,1,1,4875,1283,6
+5548,convert_element_type_1424,call_function,convert_element_type.default,backward,11,1,1,1,4868,2,3
+5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10
+5550,dtype_cast_404,call_function,dtype_cast.default,backward,11,1,1,1,4869,1,3
+5551,alias_default_1354,call_function,alias.default,backward,11,1,1,0,4870,0,2
+5552,alias_default_1062,call_function,alias.default,unknown,,1,1,3,4877,1281,4
+5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5
+5554,permute_839,call_function,permute.default,backward,11,1,1,1,4,1277,3
+5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5
+5556,permute_840,call_function,permute.default,backward,11,1,1,1,4879,2,4
+5557,dtype_cast_405,call_function,dtype_cast.default,backward,11,1,1,1,4880,1,4
+5558,alias_default_1349,call_function,alias.default,backward,11,1,1,0,4881,0,3
+5559,view_1008,call_function,view.default,backward,11,1,1,1,4880,1275,4
+5560,permute_841,call_function,permute.default,backward,11,1,1,1,4881,1274,4
+5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2
+5562,getitem_300,call_function,getitem,backward,11,1,1,1,4886,1246,2
+5563,getitem_301,call_function,getitem,backward,11,1,1,1,4886,1247,2
+5564,getitem_302,call_function,getitem,backward,11,1,1,1,4886,1240,2
+5565,permute_842,call_function,permute.default,backward,11,1,1,1,4887,1239,2
+5566,permute_843,call_function,permute.default,backward,11,1,1,1,4887,1246,2
+5567,permute_844,call_function,permute.default,backward,11,1,1,1,4887,1245,2
+5568,convert_element_type_1429,call_function,convert_element_type.default,backward,11,1,1,1,4888,1245,2
+5569,convert_element_type_1430,call_function,convert_element_type.default,backward,11,1,1,1,4888,1244,2
+5570,view_1009,call_function,view.default,backward,11,1,1,1,4889,1244,2
+5571,view_as_complex_88,call_function,view_as_complex.default,backward,11,1,1,1,4890,1243,6
+5572,_conj_32,call_function,_conj.default,backward,11,1,1,1,4,1244,3
+5573,clone_134,call_function,clone.default,backward,11,1,1,1,5,1243,3
+5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8
+5575,view_1010,call_function,view.default,backward,11,1,1,1,4889,1243,2
+5576,view_as_complex_89,call_function,view_as_complex.default,backward,11,1,1,1,4890,1242,6
+5577,_conj_33,call_function,_conj.default,backward,11,1,1,1,4,1243,3
+5578,clone_135,call_function,clone.default,backward,11,1,1,1,5,1242,3
+5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8
+5580,view_as_real_88,call_function,view_as_real.default,backward,11,1,1,1,4894,1241,6
+5581,view_1011,call_function,view.default,backward,11,1,1,1,4895,1240,6
+5582,convert_element_type_1431,call_function,convert_element_type.default,backward,11,1,1,1,4896,1239,6
+5583,view_as_real_89,call_function,view_as_real.default,backward,11,1,1,1,4894,1240,6
+5584,view_1012,call_function,view.default,backward,11,1,1,1,4895,1239,6
+5585,convert_element_type_1432,call_function,convert_element_type.default,backward,11,1,1,1,4896,1238,6
+5586,view_1013,call_function,view.default,backward,11,1,1,1,4888,1238,2
+5587,view_1014,call_function,view.default,backward,11,1,1,1,4897,1238,5
+5588,view_1015,call_function,view.default,backward,11,1,1,1,4897,1237,5
+5589,alias_default_1063,call_function,alias.default,backward,11,1,1,2,4889,1237,4
+5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5
+5591,permute_847,call_function,permute.default,backward,11,1,1,1,4,1233,3
+5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5
+5593,permute_848,call_function,permute.default,backward,11,1,1,1,4891,2,4
+5594,dtype_cast_406,call_function,dtype_cast.default,backward,11,1,1,1,4892,1,4
+5595,alias_default_1348,call_function,alias.default,backward,11,1,1,0,4893,0,3
+5596,alias_default_1064,call_function,alias.default,backward,11,1,1,2,4898,1237,4
+5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5
+5598,permute_851,call_function,permute.default,backward,11,1,1,1,4,1233,3
+5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5
+5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10
+5601,permute_852,call_function,permute.default,backward,11,1,1,1,4900,2,4
+5602,dtype_cast_407,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4
+5603,alias_default_1347,call_function,alias.default,backward,11,1,1,0,4902,0,3
+5604,alias_default_1065,call_function,alias.default,backward,11,1,1,2,4898,1236,4
+5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5
+5606,permute_855,call_function,permute.default,backward,11,1,1,1,4,1232,3
+5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5
+5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10
+5609,permute_856,call_function,permute.default,backward,11,1,1,1,4900,2,4
+5610,dtype_cast_408,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4
+5611,alias_default_1346,call_function,alias.default,backward,11,1,1,0,4902,0,3
+5612,convert_element_type_1445,call_function,convert_element_type.default,backward,11,1,1,1,4924,1229,8
+5613,convert_element_type_1446,call_function,convert_element_type.default,backward,11,1,1,1,1219,1229,4
+5614,convert_element_type_1447,call_function,convert_element_type.default,backward,11,1,1,1,3,1223,2
+5615,alias_default_1066,call_function,alias.default,backward,11,1,1,2,4925,1228,4
+5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8
+5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8
+5618,alias_default_1067,call_function,alias.default,backward,11,1,1,2,4928,1221,4
+5619,alias_default_1068,call_function,alias.default,backward,11,1,1,3,1228,1227,4
+5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8
+5621,sum_69,call_function,sum.dim_IntList,backward,11,1,1,1,4933,1219,5
+5622,div_62,call_function,div.Tensor,backward,11,1,1,1,1229,1219,6
+5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8
+5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10
+5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8
+5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8
+5627,sum_70,call_function,sum.dim_IntList,backward,11,1,1,1,4930,3,5
+5628,convert_element_type_1448,call_function,convert_element_type.default,backward,11,1,1,1,4938,1215,6
+5629,convert_element_type_1449,call_function,convert_element_type.default,backward,11,1,1,1,4931,2,3
+5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10
+5631,dtype_cast_409,call_function,dtype_cast.default,backward,11,1,1,1,4932,1,3
+5632,alias_default_1353,call_function,alias.default,backward,11,1,1,0,4933,0,2
+5633,alias_default_1069,call_function,alias.default,unknown,,1,1,3,4940,1213,4
+5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5
+5635,permute_859,call_function,permute.default,backward,10,1,1,1,4,1209,3
+5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5
+5637,permute_860,call_function,permute.default,backward,10,1,1,1,4942,2,4
+5638,dtype_cast_410,call_function,dtype_cast.default,backward,10,1,1,1,4943,1,4
+5639,alias_default_1342,call_function,alias.default,backward,10,1,1,0,4944,0,3
+5640,alias_default_1070,call_function,alias.default,backward,10,1,1,2,4943,1207,4
+5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8
+5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8
+5643,alias_default_1071,call_function,alias.default,backward,10,1,1,2,4945,1194,4
+5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5
+5645,permute_863,call_function,permute.default,backward,10,1,1,1,4,1190,3
+5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5
+5647,permute_864,call_function,permute.default,backward,10,1,1,1,4947,2,4
+5648,dtype_cast_411,call_function,dtype_cast.default,backward,10,1,1,1,4948,1,4
+5649,alias_default_1343,call_function,alias.default,backward,10,1,1,0,4949,0,3
+5650,convert_element_type_1458,call_function,convert_element_type.default,backward,10,1,1,1,4945,1198,6
+5651,convert_element_type_1459,call_function,convert_element_type.default,backward,10,1,1,1,1196,1208,4
+5652,alias_default_1072,call_function,alias.default,backward,10,1,1,2,1197,1207,4
+5653,neg_45,call_function,neg.default,backward,10,1,1,1,1198,1206,8
+5654,exp_45,call_function,exp.default,backward,10,1,1,1,1199,1205,6
+5655,add_260,call_function,add.Tensor,backward,10,1,1,1,1200,1204,4
+5656,reciprocal_17,call_function,reciprocal.default,backward,10,1,1,1,1201,1203,4
+5657,mul_546,call_function,mul.Tensor,backward,10,1,1,1,1202,1202,6
+5658,alias_default_1073,call_function,alias.default,backward,10,1,1,2,1203,1201,4
+5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8
+5660,sub_52,call_function,sub.Tensor,backward,10,1,1,1,1204,1199,4
+5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8
+5662,add_261,call_function,add.Tensor,backward,10,1,1,1,1206,1197,4
+5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8
+5664,convert_element_type_1460,call_function,convert_element_type.default,backward,10,1,1,1,4959,1195,6
+5665,alias_default_1074,call_function,alias.default,backward,10,1,1,2,4960,1194,4
+5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5
+5667,permute_867,call_function,permute.default,backward,10,1,1,1,4,1190,3
+5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5
+5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10
+5670,permute_868,call_function,permute.default,backward,10,1,1,1,4962,2,4
+5671,dtype_cast_412,call_function,dtype_cast.default,backward,10,1,1,1,4963,1,4
+5672,alias_default_1341,call_function,alias.default,backward,10,1,1,0,4964,0,3
+5673,convert_element_type_1465,call_function,convert_element_type.default,backward,10,1,1,1,4968,1187,8
+5674,convert_element_type_1466,call_function,convert_element_type.default,backward,10,1,1,1,1176,1187,4
+5675,convert_element_type_1467,call_function,convert_element_type.default,backward,10,1,1,1,3,1181,2
+5676,alias_default_1075,call_function,alias.default,backward,10,1,1,2,4969,1186,4
+5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8
+5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8
+5679,alias_default_1076,call_function,alias.default,backward,10,1,1,2,4972,1179,4
+5680,alias_default_1077,call_function,alias.default,backward,10,1,1,3,1185,1185,4
+5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8
+5682,sum_71,call_function,sum.dim_IntList,backward,10,1,1,1,4977,1177,5
+5683,div_63,call_function,div.Tensor,backward,10,1,1,1,1186,1177,6
+5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8
+5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10
+5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8
+5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8
+5688,sum_72,call_function,sum.dim_IntList,backward,10,1,1,1,4974,3,5
+5689,convert_element_type_1468,call_function,convert_element_type.default,backward,10,1,1,1,4982,1173,6
+5690,convert_element_type_1469,call_function,convert_element_type.default,backward,10,1,1,1,4975,2,3
+5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10
+5692,dtype_cast_413,call_function,dtype_cast.default,backward,10,1,1,1,4976,1,3
+5693,alias_default_1345,call_function,alias.default,backward,10,1,1,0,4977,0,2
+5694,alias_default_1078,call_function,alias.default,unknown,,1,1,3,4984,1171,4
+5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5
+5696,permute_871,call_function,permute.default,backward,10,1,1,1,4,1167,3
+5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5
+5698,permute_872,call_function,permute.default,backward,10,1,1,1,4986,2,4
+5699,dtype_cast_414,call_function,dtype_cast.default,backward,10,1,1,1,4987,1,4
+5700,alias_default_1340,call_function,alias.default,backward,10,1,1,0,4988,0,3
+5701,view_1030,call_function,view.default,backward,10,1,1,1,4987,1165,4
+5702,permute_873,call_function,permute.default,backward,10,1,1,1,4988,1164,4
+5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2
+5704,getitem_303,call_function,getitem,backward,10,1,1,1,4993,1136,2
+5705,getitem_304,call_function,getitem,backward,10,1,1,1,4993,1137,2
+5706,getitem_305,call_function,getitem,backward,10,1,1,1,4993,1130,2
+5707,permute_874,call_function,permute.default,backward,10,1,1,1,4994,1129,2
+5708,permute_875,call_function,permute.default,backward,10,1,1,1,4994,1136,2
+5709,permute_876,call_function,permute.default,backward,10,1,1,1,4994,1135,2
+5710,convert_element_type_1474,call_function,convert_element_type.default,backward,10,1,1,1,4995,1135,2
+5711,convert_element_type_1475,call_function,convert_element_type.default,backward,10,1,1,1,4995,1134,2
+5712,view_1031,call_function,view.default,backward,10,1,1,1,4996,1134,2
+5713,view_as_complex_90,call_function,view_as_complex.default,backward,10,1,1,1,4997,1133,6
+5714,_conj_34,call_function,_conj.default,backward,10,1,1,1,4,1134,3
+5715,clone_142,call_function,clone.default,backward,10,1,1,1,5,1133,3
+5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8
+5717,view_1032,call_function,view.default,backward,10,1,1,1,4996,1133,2
+5718,view_as_complex_91,call_function,view_as_complex.default,backward,10,1,1,1,4997,1132,6
+5719,_conj_35,call_function,_conj.default,backward,10,1,1,1,4,1133,3
+5720,clone_143,call_function,clone.default,backward,10,1,1,1,5,1132,3
+5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8
+5722,view_as_real_90,call_function,view_as_real.default,backward,10,1,1,1,5001,1131,6
+5723,view_1033,call_function,view.default,backward,10,1,1,1,5002,1130,6
+5724,convert_element_type_1476,call_function,convert_element_type.default,backward,10,1,1,1,5003,1129,6
+5725,view_as_real_91,call_function,view_as_real.default,backward,10,1,1,1,5001,1130,6
+5726,view_1034,call_function,view.default,backward,10,1,1,1,5002,1129,6
+5727,convert_element_type_1477,call_function,convert_element_type.default,backward,10,1,1,1,5003,1128,6
+5728,view_1035,call_function,view.default,backward,10,1,1,1,4995,1128,2
+5729,view_1036,call_function,view.default,backward,10,1,1,1,5004,1128,5
+5730,view_1037,call_function,view.default,backward,10,1,1,1,5004,1127,5
+5731,alias_default_1079,call_function,alias.default,backward,10,1,1,2,4996,1127,4
+5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5
+5733,permute_879,call_function,permute.default,backward,10,1,1,1,4,1123,3
+5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5
+5735,permute_880,call_function,permute.default,backward,10,1,1,1,4998,2,4
+5736,dtype_cast_415,call_function,dtype_cast.default,backward,10,1,1,1,4999,1,4
+5737,alias_default_1339,call_function,alias.default,backward,10,1,1,0,5000,0,3
+5738,alias_default_1080,call_function,alias.default,backward,10,1,1,2,5005,1127,4
+5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5
+5740,permute_883,call_function,permute.default,backward,10,1,1,1,4,1123,3
+5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5
+5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10
+5743,permute_884,call_function,permute.default,backward,10,1,1,1,5007,2,4
+5744,dtype_cast_416,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4
+5745,alias_default_1338,call_function,alias.default,backward,10,1,1,0,5009,0,3
+5746,alias_default_1081,call_function,alias.default,backward,10,1,1,2,5005,1126,4
+5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5
+5748,permute_887,call_function,permute.default,backward,10,1,1,1,4,1122,3
+5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5
+5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10
+5751,permute_888,call_function,permute.default,backward,10,1,1,1,5007,2,4
+5752,dtype_cast_417,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4
+5753,alias_default_1337,call_function,alias.default,backward,10,1,1,0,5009,0,3
+5754,convert_element_type_1490,call_function,convert_element_type.default,backward,10,1,1,1,5031,1119,8
+5755,convert_element_type_1491,call_function,convert_element_type.default,backward,10,1,1,1,1109,1119,4
+5756,convert_element_type_1492,call_function,convert_element_type.default,backward,10,1,1,1,3,1113,2
+5757,alias_default_1082,call_function,alias.default,backward,10,1,1,2,5032,1118,4
+5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8
+5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8
+5760,alias_default_1083,call_function,alias.default,backward,10,1,1,2,5035,1111,4
+5761,alias_default_1084,call_function,alias.default,backward,10,1,1,3,1118,1117,4
+5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8
+5763,sum_73,call_function,sum.dim_IntList,backward,10,1,1,1,5040,1109,5
+5764,div_64,call_function,div.Tensor,backward,10,1,1,1,1119,1109,6
+5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8
+5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10
+5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8
+5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8
+5769,sum_74,call_function,sum.dim_IntList,backward,10,1,1,1,5037,3,5
+5770,convert_element_type_1493,call_function,convert_element_type.default,backward,10,1,1,1,5045,1105,6
+5771,convert_element_type_1494,call_function,convert_element_type.default,backward,10,1,1,1,5038,2,3
+5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10
+5773,dtype_cast_418,call_function,dtype_cast.default,backward,10,1,1,1,5039,1,3
+5774,alias_default_1344,call_function,alias.default,backward,10,1,1,0,5040,0,2
+5775,alias_default_1085,call_function,alias.default,unknown,,1,1,3,5047,1103,4
+5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5
+5777,permute_891,call_function,permute.default,backward,9,1,1,1,4,1099,3
+5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5
+5779,permute_892,call_function,permute.default,backward,9,1,1,1,5049,2,4
+5780,dtype_cast_419,call_function,dtype_cast.default,backward,9,1,1,1,5050,1,4
+5781,alias_default_1333,call_function,alias.default,backward,9,1,1,0,5051,0,3
+5782,alias_default_1086,call_function,alias.default,backward,9,1,1,2,5050,1097,4
+5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8
+5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8
+5785,alias_default_1087,call_function,alias.default,backward,9,1,1,2,5052,1084,4
+5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5
+5787,permute_895,call_function,permute.default,backward,9,1,1,1,4,1080,3
+5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5
+5789,permute_896,call_function,permute.default,backward,9,1,1,1,5054,2,4
+5790,dtype_cast_420,call_function,dtype_cast.default,backward,9,1,1,1,5055,1,4
+5791,alias_default_1334,call_function,alias.default,backward,9,1,1,0,5056,0,3
+5792,convert_element_type_1503,call_function,convert_element_type.default,backward,9,1,1,1,5052,1088,6
+5793,convert_element_type_1504,call_function,convert_element_type.default,backward,9,1,1,1,1086,1098,4
+5794,alias_default_1088,call_function,alias.default,backward,9,1,1,2,1087,1097,4
+5795,neg_46,call_function,neg.default,backward,9,1,1,1,1088,1096,8
+5796,exp_46,call_function,exp.default,backward,9,1,1,1,1089,1095,6
+5797,add_267,call_function,add.Tensor,backward,9,1,1,1,1090,1094,4
+5798,reciprocal_18,call_function,reciprocal.default,backward,9,1,1,1,1091,1093,4
+5799,mul_566,call_function,mul.Tensor,backward,9,1,1,1,1092,1092,6
+5800,alias_default_1089,call_function,alias.default,backward,9,1,1,2,1093,1091,4
+5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8
+5802,sub_55,call_function,sub.Tensor,backward,9,1,1,1,1094,1089,4
+5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8
+5804,add_268,call_function,add.Tensor,backward,9,1,1,1,1096,1087,4
+5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8
+5806,convert_element_type_1505,call_function,convert_element_type.default,backward,9,1,1,1,5066,1085,6
+5807,alias_default_1090,call_function,alias.default,backward,9,1,1,2,5067,1084,4
+5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5
+5809,permute_899,call_function,permute.default,backward,9,1,1,1,4,1080,3
+5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5
+5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10
+5812,permute_900,call_function,permute.default,backward,9,1,1,1,5069,2,4
+5813,dtype_cast_421,call_function,dtype_cast.default,backward,9,1,1,1,5070,1,4
+5814,alias_default_1332,call_function,alias.default,backward,9,1,1,0,5071,0,3
+5815,convert_element_type_1510,call_function,convert_element_type.default,backward,9,1,1,1,5075,1077,8
+5816,convert_element_type_1511,call_function,convert_element_type.default,backward,9,1,1,1,1066,1077,4
+5817,convert_element_type_1512,call_function,convert_element_type.default,backward,9,1,1,1,3,1071,2
+5818,alias_default_1091,call_function,alias.default,backward,9,1,1,2,5076,1076,4
+5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8
+5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8
+5821,alias_default_1092,call_function,alias.default,backward,9,1,1,2,5079,1069,4
+5822,alias_default_1093,call_function,alias.default,backward,9,1,1,3,1075,1075,4
+5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8
+5824,sum_75,call_function,sum.dim_IntList,backward,9,1,1,1,5084,1067,5
+5825,div_65,call_function,div.Tensor,backward,9,1,1,1,1076,1067,6
+5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8
+5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10
+5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8
+5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8
+5830,sum_76,call_function,sum.dim_IntList,backward,9,1,1,1,5081,3,5
+5831,convert_element_type_1513,call_function,convert_element_type.default,backward,9,1,1,1,5089,1063,6
+5832,convert_element_type_1514,call_function,convert_element_type.default,backward,9,1,1,1,5082,2,3
+5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10
+5834,dtype_cast_422,call_function,dtype_cast.default,backward,9,1,1,1,5083,1,3
+5835,alias_default_1336,call_function,alias.default,backward,9,1,1,0,5084,0,2
+5836,alias_default_1094,call_function,alias.default,unknown,,1,1,3,5091,1061,4
+5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5
+5838,permute_903,call_function,permute.default,backward,9,1,1,1,4,1057,3
+5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5
+5840,permute_904,call_function,permute.default,backward,9,1,1,1,5093,2,4
+5841,dtype_cast_423,call_function,dtype_cast.default,backward,9,1,1,1,5094,1,4
+5842,alias_default_1331,call_function,alias.default,backward,9,1,1,0,5095,0,3
+5843,view_1052,call_function,view.default,backward,9,1,1,1,5094,1055,4
+5844,permute_905,call_function,permute.default,backward,9,1,1,1,5095,1054,4
+5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2
+5846,getitem_306,call_function,getitem,backward,9,1,1,1,5100,1026,2
+5847,getitem_307,call_function,getitem,backward,9,1,1,1,5100,1027,2
+5848,getitem_308,call_function,getitem,backward,9,1,1,1,5100,1020,2
+5849,permute_906,call_function,permute.default,backward,9,1,1,1,5101,1019,2
+5850,permute_907,call_function,permute.default,backward,9,1,1,1,5101,1026,2
+5851,permute_908,call_function,permute.default,backward,9,1,1,1,5101,1025,2
+5852,convert_element_type_1519,call_function,convert_element_type.default,backward,9,1,1,1,5102,1025,2
+5853,convert_element_type_1520,call_function,convert_element_type.default,backward,9,1,1,1,5102,1024,2
+5854,view_1053,call_function,view.default,backward,9,1,1,1,5103,1024,2
+5855,view_as_complex_92,call_function,view_as_complex.default,backward,9,1,1,1,5104,1023,6
+5856,_conj_36,call_function,_conj.default,backward,9,1,1,1,4,1024,3
+5857,clone_150,call_function,clone.default,backward,9,1,1,1,5,1023,3
+5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8
+5859,view_1054,call_function,view.default,backward,9,1,1,1,5103,1023,2
+5860,view_as_complex_93,call_function,view_as_complex.default,backward,9,1,1,1,5104,1022,6
+5861,_conj_37,call_function,_conj.default,backward,9,1,1,1,4,1023,3
+5862,clone_151,call_function,clone.default,backward,9,1,1,1,5,1022,3
+5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8
+5864,view_as_real_92,call_function,view_as_real.default,backward,9,1,1,1,5108,1021,6
+5865,view_1055,call_function,view.default,backward,9,1,1,1,5109,1020,6
+5866,convert_element_type_1521,call_function,convert_element_type.default,backward,9,1,1,1,5110,1019,6
+5867,view_as_real_93,call_function,view_as_real.default,backward,9,1,1,1,5108,1020,6
+5868,view_1056,call_function,view.default,backward,9,1,1,1,5109,1019,6
+5869,convert_element_type_1522,call_function,convert_element_type.default,backward,9,1,1,1,5110,1018,6
+5870,view_1057,call_function,view.default,backward,9,1,1,1,5102,1018,2
+5871,view_1058,call_function,view.default,backward,9,1,1,1,5111,1018,5
+5872,view_1059,call_function,view.default,backward,9,1,1,1,5111,1017,5
+5873,alias_default_1095,call_function,alias.default,backward,9,1,1,2,5103,1017,4
+5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5
+5875,permute_911,call_function,permute.default,backward,9,1,1,1,4,1013,3
+5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5
+5877,permute_912,call_function,permute.default,backward,9,1,1,1,5105,2,4
+5878,dtype_cast_424,call_function,dtype_cast.default,backward,9,1,1,1,5106,1,4
+5879,alias_default_1330,call_function,alias.default,backward,9,1,1,0,5107,0,3
+5880,alias_default_1096,call_function,alias.default,backward,9,1,1,2,5112,1017,4
+5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5
+5882,permute_915,call_function,permute.default,backward,9,1,1,1,4,1013,3
+5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5
+5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10
+5885,permute_916,call_function,permute.default,backward,9,1,1,1,5114,2,4
+5886,dtype_cast_425,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4
+5887,alias_default_1329,call_function,alias.default,backward,9,1,1,0,5116,0,3
+5888,alias_default_1097,call_function,alias.default,backward,9,1,1,2,5112,1016,4
+5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5
+5890,permute_919,call_function,permute.default,backward,9,1,1,1,4,1012,3
+5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5
+5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10
+5893,permute_920,call_function,permute.default,backward,9,1,1,1,5114,2,4
+5894,dtype_cast_426,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4
+5895,alias_default_1328,call_function,alias.default,backward,9,1,1,0,5116,0,3
+5896,convert_element_type_1535,call_function,convert_element_type.default,backward,9,1,1,1,5138,1009,8
+5897,convert_element_type_1536,call_function,convert_element_type.default,backward,9,1,1,1,999,1009,4
+5898,convert_element_type_1537,call_function,convert_element_type.default,backward,9,1,1,1,3,1003,2
+5899,alias_default_1098,call_function,alias.default,backward,9,1,1,2,5139,1008,4
+5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8
+5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8
+5902,alias_default_1099,call_function,alias.default,backward,9,1,1,2,5142,1001,4
+5903,alias_default_1100,call_function,alias.default,backward,9,1,1,3,1008,1007,4
+5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8
+5905,sum_77,call_function,sum.dim_IntList,backward,9,1,1,1,5147,999,5
+5906,div_66,call_function,div.Tensor,backward,9,1,1,1,1009,999,6
+5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8
+5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10
+5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8
+5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8
+5911,sum_78,call_function,sum.dim_IntList,backward,9,1,1,1,5144,3,5
+5912,convert_element_type_1538,call_function,convert_element_type.default,backward,9,1,1,1,5152,995,6
+5913,convert_element_type_1539,call_function,convert_element_type.default,backward,9,1,1,1,5145,2,3
+5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10
+5915,dtype_cast_427,call_function,dtype_cast.default,backward,9,1,1,1,5146,1,3
+5916,alias_default_1335,call_function,alias.default,backward,9,1,1,0,5147,0,2
+5917,alias_default_1101,call_function,alias.default,unknown,,1,1,3,5154,993,4
+5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5
+5919,permute_923,call_function,permute.default,backward,8,1,1,1,4,989,3
+5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5
+5921,permute_924,call_function,permute.default,backward,8,1,1,1,5156,2,4
+5922,dtype_cast_428,call_function,dtype_cast.default,backward,8,1,1,1,5157,1,4
+5923,alias_default_1324,call_function,alias.default,backward,8,1,1,0,5158,0,3
+5924,alias_default_1102,call_function,alias.default,backward,8,1,1,2,5157,987,4
+5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8
+5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8
+5927,alias_default_1103,call_function,alias.default,backward,8,1,1,2,5159,974,4
+5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5
+5929,permute_927,call_function,permute.default,backward,8,1,1,1,4,970,3
+5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5
+5931,permute_928,call_function,permute.default,backward,8,1,1,1,5161,2,4
+5932,dtype_cast_429,call_function,dtype_cast.default,backward,8,1,1,1,5162,1,4
+5933,alias_default_1325,call_function,alias.default,backward,8,1,1,0,5163,0,3
+5934,convert_element_type_1548,call_function,convert_element_type.default,backward,8,1,1,1,5159,978,6
+5935,convert_element_type_1549,call_function,convert_element_type.default,backward,8,1,1,1,976,988,4
+5936,alias_default_1104,call_function,alias.default,backward,8,1,1,2,977,987,4
+5937,neg_47,call_function,neg.default,backward,8,1,1,1,978,986,8
+5938,exp_47,call_function,exp.default,backward,8,1,1,1,979,985,6
+5939,add_274,call_function,add.Tensor,backward,8,1,1,1,980,984,4
+5940,reciprocal_19,call_function,reciprocal.default,backward,8,1,1,1,981,983,4
+5941,mul_586,call_function,mul.Tensor,backward,8,1,1,1,982,982,6
+5942,alias_default_1105,call_function,alias.default,backward,8,1,1,2,983,981,4
+5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8
+5944,sub_58,call_function,sub.Tensor,backward,8,1,1,1,984,979,4
+5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8
+5946,add_275,call_function,add.Tensor,backward,8,1,1,1,986,977,4
+5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8
+5948,convert_element_type_1550,call_function,convert_element_type.default,backward,8,1,1,1,5173,975,6
+5949,alias_default_1106,call_function,alias.default,backward,8,1,1,2,5174,974,4
+5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5
+5951,permute_931,call_function,permute.default,backward,8,1,1,1,4,970,3
+5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5
+5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10
+5954,permute_932,call_function,permute.default,backward,8,1,1,1,5176,2,4
+5955,dtype_cast_430,call_function,dtype_cast.default,backward,8,1,1,1,5177,1,4
+5956,alias_default_1323,call_function,alias.default,backward,8,1,1,0,5178,0,3
+5957,convert_element_type_1555,call_function,convert_element_type.default,backward,8,1,1,1,5182,967,8
+5958,convert_element_type_1556,call_function,convert_element_type.default,backward,8,1,1,1,956,967,4
+5959,convert_element_type_1557,call_function,convert_element_type.default,backward,8,1,1,1,3,961,2
+5960,alias_default_1107,call_function,alias.default,backward,8,1,1,2,5183,966,4
+5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8
+5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8
+5963,alias_default_1108,call_function,alias.default,backward,8,1,1,2,5186,959,4
+5964,alias_default_1109,call_function,alias.default,backward,8,1,1,3,965,965,4
+5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8
+5966,sum_79,call_function,sum.dim_IntList,backward,8,1,1,1,5191,957,5
+5967,div_67,call_function,div.Tensor,backward,8,1,1,1,966,957,6
+5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8
+5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10
+5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8
+5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8
+5972,sum_80,call_function,sum.dim_IntList,backward,8,1,1,1,5188,3,5
+5973,convert_element_type_1558,call_function,convert_element_type.default,backward,8,1,1,1,5196,953,6
+5974,convert_element_type_1559,call_function,convert_element_type.default,backward,8,1,1,1,5189,2,3
+5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10
+5976,dtype_cast_431,call_function,dtype_cast.default,backward,8,1,1,1,5190,1,3
+5977,alias_default_1327,call_function,alias.default,backward,8,1,1,0,5191,0,2
+5978,alias_default_1110,call_function,alias.default,unknown,,1,1,3,5198,951,4
+5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5
+5980,permute_935,call_function,permute.default,backward,8,1,1,1,4,947,3
+5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5
+5982,permute_936,call_function,permute.default,backward,8,1,1,1,5200,2,4
+5983,dtype_cast_432,call_function,dtype_cast.default,backward,8,1,1,1,5201,1,4
+5984,alias_default_1322,call_function,alias.default,backward,8,1,1,0,5202,0,3
+5985,view_1074,call_function,view.default,backward,8,1,1,1,5201,945,4
+5986,permute_937,call_function,permute.default,backward,8,1,1,1,5202,944,4
+5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2
+5988,getitem_309,call_function,getitem,backward,8,1,1,1,5207,916,2
+5989,getitem_310,call_function,getitem,backward,8,1,1,1,5207,917,2
+5990,getitem_311,call_function,getitem,backward,8,1,1,1,5207,910,2
+5991,permute_938,call_function,permute.default,backward,8,1,1,1,5208,909,2
+5992,permute_939,call_function,permute.default,backward,8,1,1,1,5208,916,2
+5993,permute_940,call_function,permute.default,backward,8,1,1,1,5208,915,2
+5994,convert_element_type_1564,call_function,convert_element_type.default,backward,8,1,1,1,5209,915,2
+5995,convert_element_type_1565,call_function,convert_element_type.default,backward,8,1,1,1,5209,914,2
+5996,view_1075,call_function,view.default,backward,8,1,1,1,5210,914,2
+5997,view_as_complex_94,call_function,view_as_complex.default,backward,8,1,1,1,5211,913,6
+5998,_conj_38,call_function,_conj.default,backward,8,1,1,1,4,914,3
+5999,clone_158,call_function,clone.default,backward,8,1,1,1,5,913,3
+6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8
+6001,view_1076,call_function,view.default,backward,8,1,1,1,5210,913,2
+6002,view_as_complex_95,call_function,view_as_complex.default,backward,8,1,1,1,5211,912,6
+6003,_conj_39,call_function,_conj.default,backward,8,1,1,1,4,913,3
+6004,clone_159,call_function,clone.default,backward,8,1,1,1,5,912,3
+6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8
+6006,view_as_real_94,call_function,view_as_real.default,backward,8,1,1,1,5215,911,6
+6007,view_1077,call_function,view.default,backward,8,1,1,1,5216,910,6
+6008,convert_element_type_1566,call_function,convert_element_type.default,backward,8,1,1,1,5217,909,6
+6009,view_as_real_95,call_function,view_as_real.default,backward,8,1,1,1,5215,910,6
+6010,view_1078,call_function,view.default,backward,8,1,1,1,5216,909,6
+6011,convert_element_type_1567,call_function,convert_element_type.default,backward,8,1,1,1,5217,908,6
+6012,view_1079,call_function,view.default,backward,8,1,1,1,5209,908,2
+6013,view_1080,call_function,view.default,backward,8,1,1,1,5218,908,5
+6014,view_1081,call_function,view.default,backward,8,1,1,1,5218,907,5
+6015,alias_default_1111,call_function,alias.default,backward,8,1,1,2,5210,907,4
+6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5
+6017,permute_943,call_function,permute.default,backward,8,1,1,1,4,903,3
+6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5
+6019,permute_944,call_function,permute.default,backward,8,1,1,1,5212,2,4
+6020,dtype_cast_433,call_function,dtype_cast.default,backward,8,1,1,1,5213,1,4
+6021,alias_default_1321,call_function,alias.default,backward,8,1,1,0,5214,0,3
+6022,alias_default_1112,call_function,alias.default,backward,8,1,1,2,5219,907,4
+6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5
+6024,permute_947,call_function,permute.default,backward,8,1,1,1,4,903,3
+6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5
+6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10
+6027,permute_948,call_function,permute.default,backward,8,1,1,1,5221,2,4
+6028,dtype_cast_434,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4
+6029,alias_default_1320,call_function,alias.default,backward,8,1,1,0,5223,0,3
+6030,alias_default_1113,call_function,alias.default,backward,8,1,1,2,5219,906,4
+6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5
+6032,permute_951,call_function,permute.default,backward,8,1,1,1,4,902,3
+6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5
+6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10
+6035,permute_952,call_function,permute.default,backward,8,1,1,1,5221,2,4
+6036,dtype_cast_435,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4
+6037,alias_default_1319,call_function,alias.default,backward,8,1,1,0,5223,0,3
+6038,convert_element_type_1580,call_function,convert_element_type.default,backward,8,1,1,1,5245,899,8
+6039,convert_element_type_1581,call_function,convert_element_type.default,backward,8,1,1,1,889,899,4
+6040,convert_element_type_1582,call_function,convert_element_type.default,backward,8,1,1,1,3,893,2
+6041,alias_default_1114,call_function,alias.default,backward,8,1,1,2,5246,898,4
+6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8
+6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8
+6044,alias_default_1115,call_function,alias.default,backward,8,1,1,2,5249,891,4
+6045,alias_default_1116,call_function,alias.default,backward,8,1,1,3,898,897,4
+6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8
+6047,sum_81,call_function,sum.dim_IntList,backward,8,1,1,1,5254,889,5
+6048,div_68,call_function,div.Tensor,backward,8,1,1,1,899,889,6
+6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8
+6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10
+6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8
+6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8
+6053,sum_82,call_function,sum.dim_IntList,backward,8,1,1,1,5251,3,5
+6054,convert_element_type_1583,call_function,convert_element_type.default,backward,8,1,1,1,5259,885,6
+6055,convert_element_type_1584,call_function,convert_element_type.default,backward,8,1,1,1,5252,2,3
+6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10
+6057,dtype_cast_436,call_function,dtype_cast.default,backward,8,1,1,1,5253,1,3
+6058,alias_default_1326,call_function,alias.default,backward,8,1,1,0,5254,0,2
+6059,alias_default_1117,call_function,alias.default,unknown,,1,1,3,5261,883,4
+6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5
+6061,permute_955,call_function,permute.default,backward,7,1,1,1,4,879,3
+6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5
+6063,permute_956,call_function,permute.default,backward,7,1,1,1,5263,2,4
+6064,dtype_cast_437,call_function,dtype_cast.default,backward,7,1,1,1,5264,1,4
+6065,alias_default_1315,call_function,alias.default,backward,7,1,1,0,5265,0,3
+6066,alias_default_1118,call_function,alias.default,backward,7,1,1,2,5264,877,4
+6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8
+6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8
+6069,alias_default_1119,call_function,alias.default,backward,7,1,1,2,5266,864,4
+6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5
+6071,permute_959,call_function,permute.default,backward,7,1,1,1,4,860,3
+6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5
+6073,permute_960,call_function,permute.default,backward,7,1,1,1,5268,2,4
+6074,dtype_cast_438,call_function,dtype_cast.default,backward,7,1,1,1,5269,1,4
+6075,alias_default_1316,call_function,alias.default,backward,7,1,1,0,5270,0,3
+6076,convert_element_type_1593,call_function,convert_element_type.default,backward,7,1,1,1,5266,868,6
+6077,convert_element_type_1594,call_function,convert_element_type.default,backward,7,1,1,1,866,878,4
+6078,alias_default_1120,call_function,alias.default,backward,7,1,1,2,867,877,4
+6079,neg_48,call_function,neg.default,backward,7,1,1,1,868,876,8
+6080,exp_48,call_function,exp.default,backward,7,1,1,1,869,875,6
+6081,add_281,call_function,add.Tensor,backward,7,1,1,1,870,874,4
+6082,reciprocal_20,call_function,reciprocal.default,backward,7,1,1,1,871,873,4
+6083,mul_606,call_function,mul.Tensor,backward,7,1,1,1,872,872,6
+6084,alias_default_1121,call_function,alias.default,backward,7,1,1,2,873,871,4
+6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8
+6086,sub_61,call_function,sub.Tensor,backward,7,1,1,1,874,869,4
+6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8
+6088,add_282,call_function,add.Tensor,backward,7,1,1,1,876,867,4
+6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8
+6090,convert_element_type_1595,call_function,convert_element_type.default,backward,7,1,1,1,5280,865,6
+6091,alias_default_1122,call_function,alias.default,backward,7,1,1,2,5281,864,4
+6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5
+6093,permute_963,call_function,permute.default,backward,7,1,1,1,4,860,3
+6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5
+6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10
+6096,permute_964,call_function,permute.default,backward,7,1,1,1,5283,2,4
+6097,dtype_cast_439,call_function,dtype_cast.default,backward,7,1,1,1,5284,1,4
+6098,alias_default_1314,call_function,alias.default,backward,7,1,1,0,5285,0,3
+6099,convert_element_type_1600,call_function,convert_element_type.default,backward,7,1,1,1,5289,857,8
+6100,convert_element_type_1601,call_function,convert_element_type.default,backward,7,1,1,1,846,857,4
+6101,convert_element_type_1602,call_function,convert_element_type.default,backward,7,1,1,1,3,851,2
+6102,alias_default_1123,call_function,alias.default,backward,7,1,1,2,5290,856,4
+6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8
+6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8
+6105,alias_default_1124,call_function,alias.default,backward,7,1,1,2,5293,849,4
+6106,alias_default_1125,call_function,alias.default,backward,7,1,1,3,855,855,4
+6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8
+6108,sum_83,call_function,sum.dim_IntList,backward,7,1,1,1,5298,847,5
+6109,div_69,call_function,div.Tensor,backward,7,1,1,1,856,847,6
+6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8
+6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10
+6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8
+6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8
+6114,sum_84,call_function,sum.dim_IntList,backward,7,1,1,1,5295,3,5
+6115,convert_element_type_1603,call_function,convert_element_type.default,backward,7,1,1,1,5303,843,6
+6116,convert_element_type_1604,call_function,convert_element_type.default,backward,7,1,1,1,5296,2,3
+6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10
+6118,dtype_cast_440,call_function,dtype_cast.default,backward,7,1,1,1,5297,1,3
+6119,alias_default_1318,call_function,alias.default,backward,7,1,1,0,5298,0,2
+6120,alias_default_1126,call_function,alias.default,unknown,,1,1,3,5305,841,4
+6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5
+6122,permute_967,call_function,permute.default,backward,7,1,1,1,4,837,3
+6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5
+6124,permute_968,call_function,permute.default,backward,7,1,1,1,5307,2,4
+6125,dtype_cast_441,call_function,dtype_cast.default,backward,7,1,1,1,5308,1,4
+6126,alias_default_1313,call_function,alias.default,backward,7,1,1,0,5309,0,3
+6127,view_1096,call_function,view.default,backward,7,1,1,1,5308,835,4
+6128,permute_969,call_function,permute.default,backward,7,1,1,1,5309,834,4
+6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2
+6130,getitem_312,call_function,getitem,backward,7,1,1,1,5314,806,2
+6131,getitem_313,call_function,getitem,backward,7,1,1,1,5314,807,2
+6132,getitem_314,call_function,getitem,backward,7,1,1,1,5314,800,2
+6133,permute_970,call_function,permute.default,backward,7,1,1,1,5315,799,2
+6134,permute_971,call_function,permute.default,backward,7,1,1,1,5315,806,2
+6135,permute_972,call_function,permute.default,backward,7,1,1,1,5315,805,2
+6136,convert_element_type_1609,call_function,convert_element_type.default,backward,7,1,1,1,5316,805,2
+6137,convert_element_type_1610,call_function,convert_element_type.default,backward,7,1,1,1,5316,804,2
+6138,view_1097,call_function,view.default,backward,7,1,1,1,5317,804,2
+6139,view_as_complex_96,call_function,view_as_complex.default,backward,7,1,1,1,5318,803,6
+6140,_conj_40,call_function,_conj.default,backward,7,1,1,1,4,804,3
+6141,clone_166,call_function,clone.default,backward,7,1,1,1,5,803,3
+6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8
+6143,view_1098,call_function,view.default,backward,7,1,1,1,5317,803,2
+6144,view_as_complex_97,call_function,view_as_complex.default,backward,7,1,1,1,5318,802,6
+6145,_conj_41,call_function,_conj.default,backward,7,1,1,1,4,803,3
+6146,clone_167,call_function,clone.default,backward,7,1,1,1,5,802,3
+6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8
+6148,view_as_real_96,call_function,view_as_real.default,backward,7,1,1,1,5322,801,6
+6149,view_1099,call_function,view.default,backward,7,1,1,1,5323,800,6
+6150,convert_element_type_1611,call_function,convert_element_type.default,backward,7,1,1,1,5324,799,6
+6151,view_as_real_97,call_function,view_as_real.default,backward,7,1,1,1,5322,800,6
+6152,view_1100,call_function,view.default,backward,7,1,1,1,5323,799,6
+6153,convert_element_type_1612,call_function,convert_element_type.default,backward,7,1,1,1,5324,798,6
+6154,view_1101,call_function,view.default,backward,7,1,1,1,5316,798,2
+6155,view_1102,call_function,view.default,backward,7,1,1,1,5325,798,5
+6156,view_1103,call_function,view.default,backward,7,1,1,1,5325,797,5
+6157,alias_default_1127,call_function,alias.default,backward,7,1,1,2,5317,797,4
+6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5
+6159,permute_975,call_function,permute.default,backward,7,1,1,1,4,793,3
+6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5
+6161,permute_976,call_function,permute.default,backward,7,1,1,1,5319,2,4
+6162,dtype_cast_442,call_function,dtype_cast.default,backward,7,1,1,1,5320,1,4
+6163,alias_default_1312,call_function,alias.default,backward,7,1,1,0,5321,0,3
+6164,alias_default_1128,call_function,alias.default,backward,7,1,1,2,5326,797,4
+6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5
+6166,permute_979,call_function,permute.default,backward,7,1,1,1,4,793,3
+6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5
+6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10
+6169,permute_980,call_function,permute.default,backward,7,1,1,1,5328,2,4
+6170,dtype_cast_443,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4
+6171,alias_default_1311,call_function,alias.default,backward,7,1,1,0,5330,0,3
+6172,alias_default_1129,call_function,alias.default,backward,7,1,1,2,5326,796,4
+6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5
+6174,permute_983,call_function,permute.default,backward,7,1,1,1,4,792,3
+6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5
+6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10
+6177,permute_984,call_function,permute.default,backward,7,1,1,1,5328,2,4
+6178,dtype_cast_444,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4
+6179,alias_default_1310,call_function,alias.default,backward,7,1,1,0,5330,0,3
+6180,convert_element_type_1625,call_function,convert_element_type.default,backward,7,1,1,1,5352,789,8
+6181,convert_element_type_1626,call_function,convert_element_type.default,backward,7,1,1,1,779,789,4
+6182,convert_element_type_1627,call_function,convert_element_type.default,backward,7,1,1,1,3,783,2
+6183,alias_default_1130,call_function,alias.default,backward,7,1,1,2,5353,788,4
+6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8
+6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8
+6186,alias_default_1131,call_function,alias.default,backward,7,1,1,2,5356,781,4
+6187,alias_default_1132,call_function,alias.default,backward,7,1,1,3,788,787,4
+6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8
+6189,sum_85,call_function,sum.dim_IntList,backward,7,1,1,1,5361,779,5
+6190,div_70,call_function,div.Tensor,backward,7,1,1,1,789,779,6
+6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8
+6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10
+6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8
+6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8
+6195,sum_86,call_function,sum.dim_IntList,backward,7,1,1,1,5358,3,5
+6196,convert_element_type_1628,call_function,convert_element_type.default,backward,7,1,1,1,5366,775,6
+6197,convert_element_type_1629,call_function,convert_element_type.default,backward,7,1,1,1,5359,2,3
+6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10
+6199,dtype_cast_445,call_function,dtype_cast.default,backward,7,1,1,1,5360,1,3
+6200,alias_default_1317,call_function,alias.default,backward,7,1,1,0,5361,0,2
+6201,alias_default_1133,call_function,alias.default,unknown,,1,1,3,5368,773,4
+6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5
+6203,permute_987,call_function,permute.default,backward,6,1,1,1,4,769,3
+6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5
+6205,permute_988,call_function,permute.default,backward,6,1,1,1,5370,2,4
+6206,dtype_cast_446,call_function,dtype_cast.default,backward,6,1,1,1,5371,1,4
+6207,alias_default_1306,call_function,alias.default,backward,6,1,1,0,5372,0,3
+6208,alias_default_1134,call_function,alias.default,backward,6,1,1,2,5371,767,4
+6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8
+6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8
+6211,alias_default_1135,call_function,alias.default,backward,6,1,1,2,5373,754,4
+6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5
+6213,permute_991,call_function,permute.default,backward,6,1,1,1,4,750,3
+6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5
+6215,permute_992,call_function,permute.default,backward,6,1,1,1,5375,2,4
+6216,dtype_cast_447,call_function,dtype_cast.default,backward,6,1,1,1,5376,1,4
+6217,alias_default_1307,call_function,alias.default,backward,6,1,1,0,5377,0,3
+6218,convert_element_type_1638,call_function,convert_element_type.default,backward,6,1,1,1,5373,758,6
+6219,convert_element_type_1639,call_function,convert_element_type.default,backward,6,1,1,1,756,768,4
+6220,alias_default_1136,call_function,alias.default,backward,6,1,1,2,757,767,4
+6221,neg_49,call_function,neg.default,backward,6,1,1,1,758,766,8
+6222,exp_49,call_function,exp.default,backward,6,1,1,1,759,765,6
+6223,add_288,call_function,add.Tensor,backward,6,1,1,1,760,764,4
+6224,reciprocal_21,call_function,reciprocal.default,backward,6,1,1,1,761,763,4
+6225,mul_626,call_function,mul.Tensor,backward,6,1,1,1,762,762,6
+6226,alias_default_1137,call_function,alias.default,backward,6,1,1,2,763,761,4
+6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8
+6228,sub_64,call_function,sub.Tensor,backward,6,1,1,1,764,759,4
+6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8
+6230,add_289,call_function,add.Tensor,backward,6,1,1,1,766,757,4
+6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8
+6232,convert_element_type_1640,call_function,convert_element_type.default,backward,6,1,1,1,5387,755,6
+6233,alias_default_1138,call_function,alias.default,backward,6,1,1,2,5388,754,4
+6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5
+6235,permute_995,call_function,permute.default,backward,6,1,1,1,4,750,3
+6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5
+6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10
+6238,permute_996,call_function,permute.default,backward,6,1,1,1,5390,2,4
+6239,dtype_cast_448,call_function,dtype_cast.default,backward,6,1,1,1,5391,1,4
+6240,alias_default_1305,call_function,alias.default,backward,6,1,1,0,5392,0,3
+6241,convert_element_type_1645,call_function,convert_element_type.default,backward,6,1,1,1,5396,747,8
+6242,convert_element_type_1646,call_function,convert_element_type.default,backward,6,1,1,1,736,747,4
+6243,convert_element_type_1647,call_function,convert_element_type.default,backward,6,1,1,1,3,741,2
+6244,alias_default_1139,call_function,alias.default,backward,6,1,1,2,5397,746,4
+6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8
+6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8
+6247,alias_default_1140,call_function,alias.default,backward,6,1,1,2,5400,739,4
+6248,alias_default_1141,call_function,alias.default,backward,6,1,1,3,745,745,4
+6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8
+6250,sum_87,call_function,sum.dim_IntList,backward,6,1,1,1,5405,737,5
+6251,div_71,call_function,div.Tensor,backward,6,1,1,1,746,737,6
+6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8
+6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10
+6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8
+6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8
+6256,sum_88,call_function,sum.dim_IntList,backward,6,1,1,1,5402,3,5
+6257,convert_element_type_1648,call_function,convert_element_type.default,backward,6,1,1,1,5410,733,6
+6258,convert_element_type_1649,call_function,convert_element_type.default,backward,6,1,1,1,5403,2,3
+6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10
+6260,dtype_cast_449,call_function,dtype_cast.default,backward,6,1,1,1,5404,1,3
+6261,alias_default_1309,call_function,alias.default,backward,6,1,1,0,5405,0,2
+6262,alias_default_1142,call_function,alias.default,unknown,,1,1,3,5412,731,4
+6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5
+6264,permute_999,call_function,permute.default,backward,6,1,1,1,4,727,3
+6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5
+6266,permute_1000,call_function,permute.default,backward,6,1,1,1,5414,2,4
+6267,dtype_cast_450,call_function,dtype_cast.default,backward,6,1,1,1,5415,1,4
+6268,alias_default_1304,call_function,alias.default,backward,6,1,1,0,5416,0,3
+6269,view_1118,call_function,view.default,backward,6,1,1,1,5415,725,4
+6270,permute_1001,call_function,permute.default,backward,6,1,1,1,5416,724,4
+6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2
+6272,getitem_315,call_function,getitem,backward,6,1,1,1,5421,696,2
+6273,getitem_316,call_function,getitem,backward,6,1,1,1,5421,697,2
+6274,getitem_317,call_function,getitem,backward,6,1,1,1,5421,690,2
+6275,permute_1002,call_function,permute.default,backward,6,1,1,1,5422,689,2
+6276,permute_1003,call_function,permute.default,backward,6,1,1,1,5422,696,2
+6277,permute_1004,call_function,permute.default,backward,6,1,1,1,5422,695,2
+6278,convert_element_type_1654,call_function,convert_element_type.default,backward,6,1,1,1,5423,695,2
+6279,convert_element_type_1655,call_function,convert_element_type.default,backward,6,1,1,1,5423,694,2
+6280,view_1119,call_function,view.default,backward,6,1,1,1,5424,694,2
+6281,view_as_complex_98,call_function,view_as_complex.default,backward,6,1,1,1,5425,693,6
+6282,_conj_42,call_function,_conj.default,backward,6,1,1,1,4,694,3
+6283,clone_174,call_function,clone.default,backward,6,1,1,1,5,693,3
+6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8
+6285,view_1120,call_function,view.default,backward,6,1,1,1,5424,693,2
+6286,view_as_complex_99,call_function,view_as_complex.default,backward,6,1,1,1,5425,692,6
+6287,_conj_43,call_function,_conj.default,backward,6,1,1,1,4,693,3
+6288,clone_175,call_function,clone.default,backward,6,1,1,1,5,692,3
+6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8
+6290,view_as_real_98,call_function,view_as_real.default,backward,6,1,1,1,5429,691,6
+6291,view_1121,call_function,view.default,backward,6,1,1,1,5430,690,6
+6292,convert_element_type_1656,call_function,convert_element_type.default,backward,6,1,1,1,5431,689,6
+6293,view_as_real_99,call_function,view_as_real.default,backward,6,1,1,1,5429,690,6
+6294,view_1122,call_function,view.default,backward,6,1,1,1,5430,689,6
+6295,convert_element_type_1657,call_function,convert_element_type.default,backward,6,1,1,1,5431,688,6
+6296,view_1123,call_function,view.default,backward,6,1,1,1,5423,688,2
+6297,view_1124,call_function,view.default,backward,6,1,1,1,5432,688,5
+6298,view_1125,call_function,view.default,backward,6,1,1,1,5432,687,5
+6299,alias_default_1143,call_function,alias.default,backward,6,1,1,2,5424,687,4
+6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5
+6301,permute_1007,call_function,permute.default,backward,6,1,1,1,4,683,3
+6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5
+6303,permute_1008,call_function,permute.default,backward,6,1,1,1,5426,2,4
+6304,dtype_cast_451,call_function,dtype_cast.default,backward,6,1,1,1,5427,1,4
+6305,alias_default_1303,call_function,alias.default,backward,6,1,1,0,5428,0,3
+6306,alias_default_1144,call_function,alias.default,backward,6,1,1,2,5433,687,4
+6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5
+6308,permute_1011,call_function,permute.default,backward,6,1,1,1,4,683,3
+6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5
+6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10
+6311,permute_1012,call_function,permute.default,backward,6,1,1,1,5435,2,4
+6312,dtype_cast_452,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4
+6313,alias_default_1302,call_function,alias.default,backward,6,1,1,0,5437,0,3
+6314,alias_default_1145,call_function,alias.default,backward,6,1,1,2,5433,686,4
+6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5
+6316,permute_1015,call_function,permute.default,backward,6,1,1,1,4,682,3
+6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5
+6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10
+6319,permute_1016,call_function,permute.default,backward,6,1,1,1,5435,2,4
+6320,dtype_cast_453,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4
+6321,alias_default_1301,call_function,alias.default,backward,6,1,1,0,5437,0,3
+6322,convert_element_type_1670,call_function,convert_element_type.default,backward,6,1,1,1,5459,679,8
+6323,convert_element_type_1671,call_function,convert_element_type.default,backward,6,1,1,1,669,679,4
+6324,convert_element_type_1672,call_function,convert_element_type.default,backward,6,1,1,1,3,673,2
+6325,alias_default_1146,call_function,alias.default,backward,6,1,1,2,5460,678,4
+6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8
+6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8
+6328,alias_default_1147,call_function,alias.default,backward,6,1,1,2,5463,671,4
+6329,alias_default_1148,call_function,alias.default,backward,6,1,1,3,678,677,4
+6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8
+6331,sum_89,call_function,sum.dim_IntList,backward,6,1,1,1,5468,669,5
+6332,div_72,call_function,div.Tensor,backward,6,1,1,1,679,669,6
+6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8
+6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10
+6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8
+6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8
+6337,sum_90,call_function,sum.dim_IntList,backward,6,1,1,1,5465,3,5
+6338,convert_element_type_1673,call_function,convert_element_type.default,backward,6,1,1,1,5473,665,6
+6339,convert_element_type_1674,call_function,convert_element_type.default,backward,6,1,1,1,5466,2,3
+6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10
+6341,dtype_cast_454,call_function,dtype_cast.default,backward,6,1,1,1,5467,1,3
+6342,alias_default_1308,call_function,alias.default,backward,6,1,1,0,5468,0,2
+6343,alias_default_1149,call_function,alias.default,unknown,,1,1,3,5475,663,4
+6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5
+6345,permute_1019,call_function,permute.default,backward,5,1,1,1,4,659,3
+6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5
+6347,permute_1020,call_function,permute.default,backward,5,1,1,1,5477,2,4
+6348,dtype_cast_455,call_function,dtype_cast.default,backward,5,1,1,1,5478,1,4
+6349,alias_default_1297,call_function,alias.default,backward,5,1,1,0,5479,0,3
+6350,alias_default_1150,call_function,alias.default,backward,5,1,1,2,5478,657,4
+6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8
+6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8
+6353,alias_default_1151,call_function,alias.default,backward,5,1,1,2,5480,644,4
+6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5
+6355,permute_1023,call_function,permute.default,backward,5,1,1,1,4,640,3
+6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5
+6357,permute_1024,call_function,permute.default,backward,5,1,1,1,5482,2,4
+6358,dtype_cast_456,call_function,dtype_cast.default,backward,5,1,1,1,5483,1,4
+6359,alias_default_1298,call_function,alias.default,backward,5,1,1,0,5484,0,3
+6360,convert_element_type_1683,call_function,convert_element_type.default,backward,5,1,1,1,5480,648,6
+6361,convert_element_type_1684,call_function,convert_element_type.default,backward,5,1,1,1,646,658,4
+6362,alias_default_1152,call_function,alias.default,backward,5,1,1,2,647,657,4
+6363,neg_50,call_function,neg.default,backward,5,1,1,1,648,656,8
+6364,exp_50,call_function,exp.default,backward,5,1,1,1,649,655,6
+6365,add_295,call_function,add.Tensor,backward,5,1,1,1,650,654,4
+6366,reciprocal_22,call_function,reciprocal.default,backward,5,1,1,1,651,653,4
+6367,mul_646,call_function,mul.Tensor,backward,5,1,1,1,652,652,6
+6368,alias_default_1153,call_function,alias.default,backward,5,1,1,2,653,651,4
+6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8
+6370,sub_67,call_function,sub.Tensor,backward,5,1,1,1,654,649,4
+6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8
+6372,add_296,call_function,add.Tensor,backward,5,1,1,1,656,647,4
+6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8
+6374,convert_element_type_1685,call_function,convert_element_type.default,backward,5,1,1,1,5494,645,6
+6375,alias_default_1154,call_function,alias.default,backward,5,1,1,2,5495,644,4
+6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5
+6377,permute_1027,call_function,permute.default,backward,5,1,1,1,4,640,3
+6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5
+6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10
+6380,permute_1028,call_function,permute.default,backward,5,1,1,1,5497,2,4
+6381,dtype_cast_457,call_function,dtype_cast.default,backward,5,1,1,1,5498,1,4
+6382,alias_default_1296,call_function,alias.default,backward,5,1,1,0,5499,0,3
+6383,convert_element_type_1690,call_function,convert_element_type.default,backward,5,1,1,1,5503,637,8
+6384,convert_element_type_1691,call_function,convert_element_type.default,backward,5,1,1,1,626,637,4
+6385,convert_element_type_1692,call_function,convert_element_type.default,backward,5,1,1,1,3,631,2
+6386,alias_default_1155,call_function,alias.default,backward,5,1,1,2,5504,636,4
+6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8
+6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8
+6389,alias_default_1156,call_function,alias.default,backward,5,1,1,2,5507,629,4
+6390,alias_default_1157,call_function,alias.default,backward,5,1,1,3,635,635,4
+6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8
+6392,sum_91,call_function,sum.dim_IntList,backward,5,1,1,1,5512,627,5
+6393,div_73,call_function,div.Tensor,backward,5,1,1,1,636,627,6
+6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8
+6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10
+6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8
+6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8
+6398,sum_92,call_function,sum.dim_IntList,backward,5,1,1,1,5509,3,5
+6399,convert_element_type_1693,call_function,convert_element_type.default,backward,5,1,1,1,5517,623,6
+6400,convert_element_type_1694,call_function,convert_element_type.default,backward,5,1,1,1,5510,2,3
+6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10
+6402,dtype_cast_458,call_function,dtype_cast.default,backward,5,1,1,1,5511,1,3
+6403,alias_default_1300,call_function,alias.default,backward,5,1,1,0,5512,0,2
+6404,alias_default_1158,call_function,alias.default,unknown,,1,1,3,5519,621,4
+6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5
+6406,permute_1031,call_function,permute.default,backward,5,1,1,1,4,617,3
+6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5
+6408,permute_1032,call_function,permute.default,backward,5,1,1,1,5521,2,4
+6409,dtype_cast_459,call_function,dtype_cast.default,backward,5,1,1,1,5522,1,4
+6410,alias_default_1295,call_function,alias.default,backward,5,1,1,0,5523,0,3
+6411,view_1140,call_function,view.default,backward,5,1,1,1,5522,615,4
+6412,permute_1033,call_function,permute.default,backward,5,1,1,1,5523,614,4
+6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2
+6414,getitem_318,call_function,getitem,backward,5,1,1,1,5528,586,2
+6415,getitem_319,call_function,getitem,backward,5,1,1,1,5528,587,2
+6416,getitem_320,call_function,getitem,backward,5,1,1,1,5528,580,2
+6417,permute_1034,call_function,permute.default,backward,5,1,1,1,5529,579,2
+6418,permute_1035,call_function,permute.default,backward,5,1,1,1,5529,586,2
+6419,permute_1036,call_function,permute.default,backward,5,1,1,1,5529,585,2
+6420,convert_element_type_1699,call_function,convert_element_type.default,backward,5,1,1,1,5530,585,2
+6421,convert_element_type_1700,call_function,convert_element_type.default,backward,5,1,1,1,5530,584,2
+6422,view_1141,call_function,view.default,backward,5,1,1,1,5531,584,2
+6423,view_as_complex_100,call_function,view_as_complex.default,backward,5,1,1,1,5532,583,6
+6424,_conj_44,call_function,_conj.default,backward,5,1,1,1,4,584,3
+6425,clone_182,call_function,clone.default,backward,5,1,1,1,5,583,3
+6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8
+6427,view_1142,call_function,view.default,backward,5,1,1,1,5531,583,2
+6428,view_as_complex_101,call_function,view_as_complex.default,backward,5,1,1,1,5532,582,6
+6429,_conj_45,call_function,_conj.default,backward,5,1,1,1,4,583,3
+6430,clone_183,call_function,clone.default,backward,5,1,1,1,5,582,3
+6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8
+6432,view_as_real_100,call_function,view_as_real.default,backward,5,1,1,1,5536,581,6
+6433,view_1143,call_function,view.default,backward,5,1,1,1,5537,580,6
+6434,convert_element_type_1701,call_function,convert_element_type.default,backward,5,1,1,1,5538,579,6
+6435,view_as_real_101,call_function,view_as_real.default,backward,5,1,1,1,5536,580,6
+6436,view_1144,call_function,view.default,backward,5,1,1,1,5537,579,6
+6437,convert_element_type_1702,call_function,convert_element_type.default,backward,5,1,1,1,5538,578,6
+6438,view_1145,call_function,view.default,backward,5,1,1,1,5530,578,2
+6439,view_1146,call_function,view.default,backward,5,1,1,1,5539,578,5
+6440,view_1147,call_function,view.default,backward,5,1,1,1,5539,577,5
+6441,alias_default_1159,call_function,alias.default,backward,5,1,1,2,5531,577,4
+6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5
+6443,permute_1039,call_function,permute.default,backward,5,1,1,1,4,573,3
+6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5
+6445,permute_1040,call_function,permute.default,backward,5,1,1,1,5533,2,4
+6446,dtype_cast_460,call_function,dtype_cast.default,backward,5,1,1,1,5534,1,4
+6447,alias_default_1294,call_function,alias.default,backward,5,1,1,0,5535,0,3
+6448,alias_default_1160,call_function,alias.default,backward,5,1,1,2,5540,577,4
+6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5
+6450,permute_1043,call_function,permute.default,backward,5,1,1,1,4,573,3
+6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5
+6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10
+6453,permute_1044,call_function,permute.default,backward,5,1,1,1,5542,2,4
+6454,dtype_cast_461,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4
+6455,alias_default_1293,call_function,alias.default,backward,5,1,1,0,5544,0,3
+6456,alias_default_1161,call_function,alias.default,backward,5,1,1,2,5540,576,4
+6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5
+6458,permute_1047,call_function,permute.default,backward,5,1,1,1,4,572,3
+6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5
+6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10
+6461,permute_1048,call_function,permute.default,backward,5,1,1,1,5542,2,4
+6462,dtype_cast_462,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4
+6463,alias_default_1292,call_function,alias.default,backward,5,1,1,0,5544,0,3
+6464,convert_element_type_1715,call_function,convert_element_type.default,backward,5,1,1,1,5566,569,8
+6465,convert_element_type_1716,call_function,convert_element_type.default,backward,5,1,1,1,559,569,4
+6466,convert_element_type_1717,call_function,convert_element_type.default,backward,5,1,1,1,3,563,2
+6467,alias_default_1162,call_function,alias.default,backward,5,1,1,2,5567,568,4
+6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8
+6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8
+6470,alias_default_1163,call_function,alias.default,backward,5,1,1,2,5570,561,4
+6471,alias_default_1164,call_function,alias.default,backward,5,1,1,3,568,567,4
+6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8
+6473,sum_93,call_function,sum.dim_IntList,backward,5,1,1,1,5575,559,5
+6474,div_74,call_function,div.Tensor,backward,5,1,1,1,569,559,6
+6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8
+6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10
+6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8
+6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8
+6479,sum_94,call_function,sum.dim_IntList,backward,5,1,1,1,5572,3,5
+6480,convert_element_type_1718,call_function,convert_element_type.default,backward,5,1,1,1,5580,555,6
+6481,convert_element_type_1719,call_function,convert_element_type.default,backward,5,1,1,1,5573,2,3
+6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10
+6483,dtype_cast_463,call_function,dtype_cast.default,backward,5,1,1,1,5574,1,3
+6484,alias_default_1299,call_function,alias.default,backward,5,1,1,0,5575,0,2
+6485,alias_default_1165,call_function,alias.default,unknown,,1,1,3,5582,553,4
+6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5
+6487,permute_1051,call_function,permute.default,backward,4,1,1,1,4,549,3
+6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5
+6489,permute_1052,call_function,permute.default,backward,4,1,1,1,5584,2,4
+6490,dtype_cast_464,call_function,dtype_cast.default,backward,4,1,1,1,5585,1,4
+6491,alias_default_1288,call_function,alias.default,backward,4,1,1,0,5586,0,3
+6492,alias_default_1166,call_function,alias.default,backward,4,1,1,2,5585,547,4
+6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8
+6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8
+6495,alias_default_1167,call_function,alias.default,backward,4,1,1,2,5587,534,4
+6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5
+6497,permute_1055,call_function,permute.default,backward,4,1,1,1,4,530,3
+6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5
+6499,permute_1056,call_function,permute.default,backward,4,1,1,1,5589,2,4
+6500,dtype_cast_465,call_function,dtype_cast.default,backward,4,1,1,1,5590,1,4
+6501,alias_default_1289,call_function,alias.default,backward,4,1,1,0,5591,0,3
+6502,convert_element_type_1728,call_function,convert_element_type.default,backward,4,1,1,1,5587,538,6
+6503,convert_element_type_1729,call_function,convert_element_type.default,backward,4,1,1,1,536,548,4
+6504,alias_default_1168,call_function,alias.default,backward,4,1,1,2,537,547,4
+6505,neg_51,call_function,neg.default,backward,4,1,1,1,538,546,8
+6506,exp_51,call_function,exp.default,backward,4,1,1,1,539,545,6
+6507,add_302,call_function,add.Tensor,backward,4,1,1,1,540,544,4
+6508,reciprocal_23,call_function,reciprocal.default,backward,4,1,1,1,541,543,4
+6509,mul_666,call_function,mul.Tensor,backward,4,1,1,1,542,542,6
+6510,alias_default_1169,call_function,alias.default,backward,4,1,1,2,543,541,4
+6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8
+6512,sub_70,call_function,sub.Tensor,backward,4,1,1,1,544,539,4
+6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8
+6514,add_303,call_function,add.Tensor,backward,4,1,1,1,546,537,4
+6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8
+6516,convert_element_type_1730,call_function,convert_element_type.default,backward,4,1,1,1,5601,535,6
+6517,alias_default_1170,call_function,alias.default,backward,4,1,1,2,5602,534,4
+6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5
+6519,permute_1059,call_function,permute.default,backward,4,1,1,1,4,530,3
+6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5
+6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10
+6522,permute_1060,call_function,permute.default,backward,4,1,1,1,5604,2,4
+6523,dtype_cast_466,call_function,dtype_cast.default,backward,4,1,1,1,5605,1,4
+6524,alias_default_1287,call_function,alias.default,backward,4,1,1,0,5606,0,3
+6525,convert_element_type_1735,call_function,convert_element_type.default,backward,4,1,1,1,5610,527,8
+6526,convert_element_type_1736,call_function,convert_element_type.default,backward,4,1,1,1,516,527,4
+6527,convert_element_type_1737,call_function,convert_element_type.default,backward,4,1,1,1,3,521,2
+6528,alias_default_1171,call_function,alias.default,backward,4,1,1,2,5611,526,4
+6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8
+6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8
+6531,alias_default_1172,call_function,alias.default,backward,4,1,1,2,5614,519,4
+6532,alias_default_1173,call_function,alias.default,backward,4,1,1,3,525,525,4
+6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8
+6534,sum_95,call_function,sum.dim_IntList,backward,4,1,1,1,5619,517,5
+6535,div_75,call_function,div.Tensor,backward,4,1,1,1,526,517,6
+6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8
+6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10
+6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8
+6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8
+6540,sum_96,call_function,sum.dim_IntList,backward,4,1,1,1,5616,3,5
+6541,convert_element_type_1738,call_function,convert_element_type.default,backward,4,1,1,1,5624,513,6
+6542,convert_element_type_1739,call_function,convert_element_type.default,backward,4,1,1,1,5617,2,3
+6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10
+6544,dtype_cast_467,call_function,dtype_cast.default,backward,4,1,1,1,5618,1,3
+6545,alias_default_1291,call_function,alias.default,backward,4,1,1,0,5619,0,2
+6546,alias_default_1174,call_function,alias.default,unknown,,1,1,3,5626,511,4
+6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5
+6548,permute_1063,call_function,permute.default,backward,4,1,1,1,4,507,3
+6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5
+6550,permute_1064,call_function,permute.default,backward,4,1,1,1,5628,2,4
+6551,dtype_cast_468,call_function,dtype_cast.default,backward,4,1,1,1,5629,1,4
+6552,alias_default_1286,call_function,alias.default,backward,4,1,1,0,5630,0,3
+6553,view_1162,call_function,view.default,backward,4,1,1,1,5629,505,4
+6554,permute_1065,call_function,permute.default,backward,4,1,1,1,5630,504,4
+6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2
+6556,getitem_321,call_function,getitem,backward,4,1,1,1,5635,476,2
+6557,getitem_322,call_function,getitem,backward,4,1,1,1,5635,477,2
+6558,getitem_323,call_function,getitem,backward,4,1,1,1,5635,470,2
+6559,permute_1066,call_function,permute.default,backward,4,1,1,1,5636,469,2
+6560,permute_1067,call_function,permute.default,backward,4,1,1,1,5636,476,2
+6561,permute_1068,call_function,permute.default,backward,4,1,1,1,5636,475,2
+6562,convert_element_type_1744,call_function,convert_element_type.default,backward,4,1,1,1,5637,475,2
+6563,convert_element_type_1745,call_function,convert_element_type.default,backward,4,1,1,1,5637,474,2
+6564,view_1163,call_function,view.default,backward,4,1,1,1,5638,474,2
+6565,view_as_complex_102,call_function,view_as_complex.default,backward,4,1,1,1,5639,473,6
+6566,_conj_46,call_function,_conj.default,backward,4,1,1,1,4,474,3
+6567,clone_190,call_function,clone.default,backward,4,1,1,1,5,473,3
+6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8
+6569,view_1164,call_function,view.default,backward,4,1,1,1,5638,473,2
+6570,view_as_complex_103,call_function,view_as_complex.default,backward,4,1,1,1,5639,472,6
+6571,_conj_47,call_function,_conj.default,backward,4,1,1,1,4,473,3
+6572,clone_191,call_function,clone.default,backward,4,1,1,1,5,472,3
+6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8
+6574,view_as_real_102,call_function,view_as_real.default,backward,4,1,1,1,5643,471,6
+6575,view_1165,call_function,view.default,backward,4,1,1,1,5644,470,6
+6576,convert_element_type_1746,call_function,convert_element_type.default,backward,4,1,1,1,5645,469,6
+6577,view_as_real_103,call_function,view_as_real.default,backward,4,1,1,1,5643,470,6
+6578,view_1166,call_function,view.default,backward,4,1,1,1,5644,469,6
+6579,convert_element_type_1747,call_function,convert_element_type.default,backward,4,1,1,1,5645,468,6
+6580,view_1167,call_function,view.default,backward,4,1,1,1,5637,468,2
+6581,view_1168,call_function,view.default,backward,4,1,1,1,5646,468,5
+6582,view_1169,call_function,view.default,backward,4,1,1,1,5646,467,5
+6583,alias_default_1175,call_function,alias.default,backward,4,1,1,2,5638,467,4
+6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5
+6585,permute_1071,call_function,permute.default,backward,4,1,1,1,4,463,3
+6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5
+6587,permute_1072,call_function,permute.default,backward,4,1,1,1,5640,2,4
+6588,dtype_cast_469,call_function,dtype_cast.default,backward,4,1,1,1,5641,1,4
+6589,alias_default_1285,call_function,alias.default,backward,4,1,1,0,5642,0,3
+6590,alias_default_1176,call_function,alias.default,backward,4,1,1,2,5647,467,4
+6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5
+6592,permute_1075,call_function,permute.default,backward,4,1,1,1,4,463,3
+6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5
+6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10
+6595,permute_1076,call_function,permute.default,backward,4,1,1,1,5649,2,4
+6596,dtype_cast_470,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4
+6597,alias_default_1284,call_function,alias.default,backward,4,1,1,0,5651,0,3
+6598,alias_default_1177,call_function,alias.default,backward,4,1,1,2,5647,466,4
+6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5
+6600,permute_1079,call_function,permute.default,backward,4,1,1,1,4,462,3
+6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5
+6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10
+6603,permute_1080,call_function,permute.default,backward,4,1,1,1,5649,2,4
+6604,dtype_cast_471,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4
+6605,alias_default_1283,call_function,alias.default,backward,4,1,1,0,5651,0,3
+6606,convert_element_type_1760,call_function,convert_element_type.default,backward,4,1,1,1,5673,459,8
+6607,convert_element_type_1761,call_function,convert_element_type.default,backward,4,1,1,1,449,459,4
+6608,convert_element_type_1762,call_function,convert_element_type.default,backward,4,1,1,1,3,453,2
+6609,alias_default_1178,call_function,alias.default,backward,4,1,1,2,5674,458,4
+6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8
+6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8
+6612,alias_default_1179,call_function,alias.default,backward,4,1,1,2,5677,451,4
+6613,alias_default_1180,call_function,alias.default,backward,4,1,1,3,458,457,4
+6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8
+6615,sum_97,call_function,sum.dim_IntList,backward,4,1,1,1,5682,449,5
+6616,div_76,call_function,div.Tensor,backward,4,1,1,1,459,449,6
+6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8
+6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10
+6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8
+6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8
+6621,sum_98,call_function,sum.dim_IntList,backward,4,1,1,1,5679,3,5
+6622,convert_element_type_1763,call_function,convert_element_type.default,backward,4,1,1,1,5687,445,6
+6623,convert_element_type_1764,call_function,convert_element_type.default,backward,4,1,1,1,5680,2,3
+6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10
+6625,dtype_cast_472,call_function,dtype_cast.default,backward,4,1,1,1,5681,1,3
+6626,alias_default_1290,call_function,alias.default,backward,4,1,1,0,5682,0,2
+6627,alias_default_1181,call_function,alias.default,unknown,,1,1,3,5689,443,4
+6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5
+6629,permute_1083,call_function,permute.default,backward,3,1,1,1,4,439,3
+6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5
+6631,permute_1084,call_function,permute.default,backward,3,1,1,1,5691,2,4
+6632,dtype_cast_473,call_function,dtype_cast.default,backward,3,1,1,1,5692,1,4
+6633,alias_default_1279,call_function,alias.default,backward,3,1,1,0,5693,0,3
+6634,alias_default_1182,call_function,alias.default,backward,3,1,1,2,5692,437,4
+6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8
+6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8
+6637,alias_default_1183,call_function,alias.default,backward,3,1,1,2,5694,424,4
+6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5
+6639,permute_1087,call_function,permute.default,backward,3,1,1,1,4,420,3
+6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5
+6641,permute_1088,call_function,permute.default,backward,3,1,1,1,5696,2,4
+6642,dtype_cast_474,call_function,dtype_cast.default,backward,3,1,1,1,5697,1,4
+6643,alias_default_1280,call_function,alias.default,backward,3,1,1,0,5698,0,3
+6644,convert_element_type_1773,call_function,convert_element_type.default,backward,3,1,1,1,5694,428,6
+6645,convert_element_type_1774,call_function,convert_element_type.default,backward,3,1,1,1,426,438,4
+6646,alias_default_1184,call_function,alias.default,backward,3,1,1,2,427,437,4
+6647,neg_52,call_function,neg.default,backward,3,1,1,1,428,436,8
+6648,exp_52,call_function,exp.default,backward,3,1,1,1,429,435,6
+6649,add_309,call_function,add.Tensor,backward,3,1,1,1,430,434,4
+6650,reciprocal_24,call_function,reciprocal.default,backward,3,1,1,1,431,433,4
+6651,mul_686,call_function,mul.Tensor,backward,3,1,1,1,432,432,6
+6652,alias_default_1185,call_function,alias.default,backward,3,1,1,2,433,431,4
+6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8
+6654,sub_73,call_function,sub.Tensor,backward,3,1,1,1,434,429,4
+6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8
+6656,add_310,call_function,add.Tensor,backward,3,1,1,1,436,427,4
+6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8
+6658,convert_element_type_1775,call_function,convert_element_type.default,backward,3,1,1,1,5708,425,6
+6659,alias_default_1186,call_function,alias.default,backward,3,1,1,2,5709,424,4
+6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5
+6661,permute_1091,call_function,permute.default,backward,3,1,1,1,4,420,3
+6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5
+6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10
+6664,permute_1092,call_function,permute.default,backward,3,1,1,1,5711,2,4
+6665,dtype_cast_475,call_function,dtype_cast.default,backward,3,1,1,1,5712,1,4
+6666,alias_default_1278,call_function,alias.default,backward,3,1,1,0,5713,0,3
+6667,convert_element_type_1780,call_function,convert_element_type.default,backward,3,1,1,1,5717,417,8
+6668,convert_element_type_1781,call_function,convert_element_type.default,backward,3,1,1,1,406,417,4
+6669,convert_element_type_1782,call_function,convert_element_type.default,backward,3,1,1,1,3,411,2
+6670,alias_default_1187,call_function,alias.default,backward,3,1,1,2,5718,416,4
+6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8
+6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8
+6673,alias_default_1188,call_function,alias.default,backward,3,1,1,2,5721,409,4
+6674,alias_default_1189,call_function,alias.default,backward,3,1,1,3,415,415,4
+6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8
+6676,sum_99,call_function,sum.dim_IntList,backward,3,1,1,1,5726,407,5
+6677,div_77,call_function,div.Tensor,backward,3,1,1,1,416,407,6
+6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8
+6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10
+6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8
+6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8
+6682,sum_100,call_function,sum.dim_IntList,backward,3,1,1,1,5723,3,5
+6683,convert_element_type_1783,call_function,convert_element_type.default,backward,3,1,1,1,5731,403,6
+6684,convert_element_type_1784,call_function,convert_element_type.default,backward,3,1,1,1,5724,2,3
+6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10
+6686,dtype_cast_476,call_function,dtype_cast.default,backward,3,1,1,1,5725,1,3
+6687,alias_default_1282,call_function,alias.default,backward,3,1,1,0,5726,0,2
+6688,alias_default_1190,call_function,alias.default,unknown,,1,1,3,5733,401,4
+6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5
+6690,permute_1095,call_function,permute.default,backward,3,1,1,1,4,397,3
+6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5
+6692,permute_1096,call_function,permute.default,backward,3,1,1,1,5735,2,4
+6693,dtype_cast_477,call_function,dtype_cast.default,backward,3,1,1,1,5736,1,4
+6694,alias_default_1277,call_function,alias.default,backward,3,1,1,0,5737,0,3
+6695,view_1184,call_function,view.default,backward,3,1,1,1,5736,395,4
+6696,permute_1097,call_function,permute.default,backward,3,1,1,1,5737,394,4
+6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2
+6698,getitem_324,call_function,getitem,backward,3,1,1,1,5742,366,2
+6699,getitem_325,call_function,getitem,backward,3,1,1,1,5742,367,2
+6700,getitem_326,call_function,getitem,backward,3,1,1,1,5742,360,2
+6701,permute_1098,call_function,permute.default,backward,3,1,1,1,5743,359,2
+6702,permute_1099,call_function,permute.default,backward,3,1,1,1,5743,366,2
+6703,permute_1100,call_function,permute.default,backward,3,1,1,1,5743,365,2
+6704,convert_element_type_1789,call_function,convert_element_type.default,backward,3,1,1,1,5744,365,2
+6705,convert_element_type_1790,call_function,convert_element_type.default,backward,3,1,1,1,5744,364,2
+6706,view_1185,call_function,view.default,backward,3,1,1,1,5745,364,2
+6707,view_as_complex_104,call_function,view_as_complex.default,backward,3,1,1,1,5746,363,6
+6708,_conj_48,call_function,_conj.default,backward,3,1,1,1,4,364,3
+6709,clone_198,call_function,clone.default,backward,3,1,1,1,5,363,3
+6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8
+6711,view_1186,call_function,view.default,backward,3,1,1,1,5745,363,2
+6712,view_as_complex_105,call_function,view_as_complex.default,backward,3,1,1,1,5746,362,6
+6713,_conj_49,call_function,_conj.default,backward,3,1,1,1,4,363,3
+6714,clone_199,call_function,clone.default,backward,3,1,1,1,5,362,3
+6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8
+6716,view_as_real_104,call_function,view_as_real.default,backward,3,1,1,1,5750,361,6
+6717,view_1187,call_function,view.default,backward,3,1,1,1,5751,360,6
+6718,convert_element_type_1791,call_function,convert_element_type.default,backward,3,1,1,1,5752,359,6
+6719,view_as_real_105,call_function,view_as_real.default,backward,3,1,1,1,5750,360,6
+6720,view_1188,call_function,view.default,backward,3,1,1,1,5751,359,6
+6721,convert_element_type_1792,call_function,convert_element_type.default,backward,3,1,1,1,5752,358,6
+6722,view_1189,call_function,view.default,backward,3,1,1,1,5744,358,2
+6723,view_1190,call_function,view.default,backward,3,1,1,1,5753,358,5
+6724,view_1191,call_function,view.default,backward,3,1,1,1,5753,357,5
+6725,alias_default_1191,call_function,alias.default,backward,3,1,1,2,5745,357,4
+6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5
+6727,permute_1103,call_function,permute.default,backward,3,1,1,1,4,353,3
+6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5
+6729,permute_1104,call_function,permute.default,backward,3,1,1,1,5747,2,4
+6730,dtype_cast_478,call_function,dtype_cast.default,backward,3,1,1,1,5748,1,4
+6731,alias_default_1276,call_function,alias.default,backward,3,1,1,0,5749,0,3
+6732,alias_default_1192,call_function,alias.default,backward,3,1,1,2,5754,357,4
+6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5
+6734,permute_1107,call_function,permute.default,backward,3,1,1,1,4,353,3
+6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5
+6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10
+6737,permute_1108,call_function,permute.default,backward,3,1,1,1,5756,2,4
+6738,dtype_cast_479,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4
+6739,alias_default_1275,call_function,alias.default,backward,3,1,1,0,5758,0,3
+6740,alias_default_1193,call_function,alias.default,backward,3,1,1,2,5754,356,4
+6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5
+6742,permute_1111,call_function,permute.default,backward,3,1,1,1,4,352,3
+6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5
+6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10
+6745,permute_1112,call_function,permute.default,backward,3,1,1,1,5756,2,4
+6746,dtype_cast_480,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4
+6747,alias_default_1274,call_function,alias.default,backward,3,1,1,0,5758,0,3
+6748,convert_element_type_1805,call_function,convert_element_type.default,backward,3,1,1,1,5780,349,8
+6749,convert_element_type_1806,call_function,convert_element_type.default,backward,3,1,1,1,339,349,4
+6750,convert_element_type_1807,call_function,convert_element_type.default,backward,3,1,1,1,3,343,2
+6751,alias_default_1194,call_function,alias.default,backward,3,1,1,2,5781,348,4
+6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8
+6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8
+6754,alias_default_1195,call_function,alias.default,backward,3,1,1,2,5784,341,4
+6755,alias_default_1196,call_function,alias.default,backward,3,1,1,3,348,347,4
+6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8
+6757,sum_101,call_function,sum.dim_IntList,backward,3,1,1,1,5789,339,5
+6758,div_78,call_function,div.Tensor,backward,3,1,1,1,349,339,6
+6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8
+6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10
+6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8
+6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8
+6763,sum_102,call_function,sum.dim_IntList,backward,3,1,1,1,5786,3,5
+6764,convert_element_type_1808,call_function,convert_element_type.default,backward,3,1,1,1,5794,335,6
+6765,convert_element_type_1809,call_function,convert_element_type.default,backward,3,1,1,1,5787,2,3
+6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10
+6767,dtype_cast_481,call_function,dtype_cast.default,backward,3,1,1,1,5788,1,3
+6768,alias_default_1281,call_function,alias.default,backward,3,1,1,0,5789,0,2
+6769,alias_default_1197,call_function,alias.default,unknown,,1,1,3,5796,333,4
+6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5
+6771,permute_1115,call_function,permute.default,backward,2,1,1,1,4,329,3
+6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5
+6773,permute_1116,call_function,permute.default,backward,2,1,1,1,5798,2,4
+6774,dtype_cast_482,call_function,dtype_cast.default,backward,2,1,1,1,5799,1,4
+6775,alias_default_1270,call_function,alias.default,backward,2,1,1,0,5800,0,3
+6776,alias_default_1198,call_function,alias.default,backward,2,1,1,2,5799,327,4
+6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8
+6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8
+6779,alias_default_1199,call_function,alias.default,backward,2,1,1,2,5801,314,4
+6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5
+6781,permute_1119,call_function,permute.default,backward,2,1,1,1,4,310,3
+6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5
+6783,permute_1120,call_function,permute.default,backward,2,1,1,1,5803,2,4
+6784,dtype_cast_483,call_function,dtype_cast.default,backward,2,1,1,1,5804,1,4
+6785,alias_default_1271,call_function,alias.default,backward,2,1,1,0,5805,0,3
+6786,convert_element_type_1818,call_function,convert_element_type.default,backward,2,1,1,1,5801,318,6
+6787,convert_element_type_1819,call_function,convert_element_type.default,backward,2,1,1,1,316,328,4
+6788,alias_default_1200,call_function,alias.default,backward,2,1,1,2,317,327,4
+6789,neg_53,call_function,neg.default,backward,2,1,1,1,318,326,8
+6790,exp_53,call_function,exp.default,backward,2,1,1,1,319,325,6
+6791,add_316,call_function,add.Tensor,backward,2,1,1,1,320,324,4
+6792,reciprocal_25,call_function,reciprocal.default,backward,2,1,1,1,321,323,4
+6793,mul_706,call_function,mul.Tensor,backward,2,1,1,1,322,322,6
+6794,alias_default_1201,call_function,alias.default,backward,2,1,1,2,323,321,4
+6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8
+6796,sub_76,call_function,sub.Tensor,backward,2,1,1,1,324,319,4
+6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8
+6798,add_317,call_function,add.Tensor,backward,2,1,1,1,326,317,4
+6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8
+6800,convert_element_type_1820,call_function,convert_element_type.default,backward,2,1,1,1,5815,315,6
+6801,alias_default_1202,call_function,alias.default,backward,2,1,1,2,5816,314,4
+6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5
+6803,permute_1123,call_function,permute.default,backward,2,1,1,1,4,310,3
+6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5
+6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10
+6806,permute_1124,call_function,permute.default,backward,2,1,1,1,5818,2,4
+6807,dtype_cast_484,call_function,dtype_cast.default,backward,2,1,1,1,5819,1,4
+6808,alias_default_1269,call_function,alias.default,backward,2,1,1,0,5820,0,3
+6809,convert_element_type_1825,call_function,convert_element_type.default,backward,2,1,1,1,5824,307,8
+6810,convert_element_type_1826,call_function,convert_element_type.default,backward,2,1,1,1,296,307,4
+6811,convert_element_type_1827,call_function,convert_element_type.default,backward,2,1,1,1,3,301,2
+6812,alias_default_1203,call_function,alias.default,backward,2,1,1,2,5825,306,4
+6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8
+6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8
+6815,alias_default_1204,call_function,alias.default,backward,2,1,1,2,5828,299,4
+6816,alias_default_1205,call_function,alias.default,backward,2,1,1,3,305,305,4
+6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8
+6818,sum_103,call_function,sum.dim_IntList,backward,2,1,1,1,5833,297,5
+6819,div_79,call_function,div.Tensor,backward,2,1,1,1,306,297,6
+6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8
+6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10
+6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8
+6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8
+6824,sum_104,call_function,sum.dim_IntList,backward,2,1,1,1,5830,3,5
+6825,convert_element_type_1828,call_function,convert_element_type.default,backward,2,1,1,1,5838,293,6
+6826,convert_element_type_1829,call_function,convert_element_type.default,backward,2,1,1,1,5831,2,3
+6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10
+6828,dtype_cast_485,call_function,dtype_cast.default,backward,2,1,1,1,5832,1,3
+6829,alias_default_1273,call_function,alias.default,backward,2,1,1,0,5833,0,2
+6830,alias_default_1206,call_function,alias.default,unknown,,1,1,3,5840,291,4
+6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5
+6832,permute_1127,call_function,permute.default,backward,2,1,1,1,4,287,3
+6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5
+6834,permute_1128,call_function,permute.default,backward,2,1,1,1,5842,2,4
+6835,dtype_cast_486,call_function,dtype_cast.default,backward,2,1,1,1,5843,1,4
+6836,alias_default_1268,call_function,alias.default,backward,2,1,1,0,5844,0,3
+6837,view_1206,call_function,view.default,backward,2,1,1,1,5843,285,4
+6838,permute_1129,call_function,permute.default,backward,2,1,1,1,5844,284,4
+6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2
+6840,getitem_327,call_function,getitem,backward,2,1,1,1,5849,256,2
+6841,getitem_328,call_function,getitem,backward,2,1,1,1,5849,257,2
+6842,getitem_329,call_function,getitem,backward,2,1,1,1,5849,250,2
+6843,permute_1130,call_function,permute.default,backward,2,1,1,1,5850,249,2
+6844,permute_1131,call_function,permute.default,backward,2,1,1,1,5850,256,2
+6845,permute_1132,call_function,permute.default,backward,2,1,1,1,5850,255,2
+6846,convert_element_type_1834,call_function,convert_element_type.default,backward,2,1,1,1,5851,255,2
+6847,convert_element_type_1835,call_function,convert_element_type.default,backward,2,1,1,1,5851,254,2
+6848,view_1207,call_function,view.default,backward,2,1,1,1,5852,254,2
+6849,view_as_complex_106,call_function,view_as_complex.default,backward,2,1,1,1,5853,253,6
+6850,_conj_50,call_function,_conj.default,backward,2,1,1,1,4,254,3
+6851,clone_206,call_function,clone.default,backward,2,1,1,1,5,253,3
+6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8
+6853,view_1208,call_function,view.default,backward,2,1,1,1,5852,253,2
+6854,view_as_complex_107,call_function,view_as_complex.default,backward,2,1,1,1,5853,252,6
+6855,_conj_51,call_function,_conj.default,backward,2,1,1,1,4,253,3
+6856,clone_207,call_function,clone.default,backward,2,1,1,1,5,252,3
+6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8
+6858,view_as_real_106,call_function,view_as_real.default,backward,2,1,1,1,5857,251,6
+6859,view_1209,call_function,view.default,backward,2,1,1,1,5858,250,6
+6860,convert_element_type_1836,call_function,convert_element_type.default,backward,2,1,1,1,5859,249,6
+6861,view_as_real_107,call_function,view_as_real.default,backward,2,1,1,1,5857,250,6
+6862,view_1210,call_function,view.default,backward,2,1,1,1,5858,249,6
+6863,convert_element_type_1837,call_function,convert_element_type.default,backward,2,1,1,1,5859,248,6
+6864,view_1211,call_function,view.default,backward,2,1,1,1,5851,248,2
+6865,view_1212,call_function,view.default,backward,2,1,1,1,5860,248,5
+6866,view_1213,call_function,view.default,backward,2,1,1,1,5860,247,5
+6867,alias_default_1207,call_function,alias.default,backward,2,1,1,2,5852,247,4
+6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5
+6869,permute_1135,call_function,permute.default,backward,2,1,1,1,4,243,3
+6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5
+6871,permute_1136,call_function,permute.default,backward,2,1,1,1,5854,2,4
+6872,dtype_cast_487,call_function,dtype_cast.default,backward,2,1,1,1,5855,1,4
+6873,alias_default_1267,call_function,alias.default,backward,2,1,1,0,5856,0,3
+6874,alias_default_1208,call_function,alias.default,backward,2,1,1,2,5861,247,4
+6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5
+6876,permute_1139,call_function,permute.default,backward,2,1,1,1,4,243,3
+6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5
+6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10
+6879,permute_1140,call_function,permute.default,backward,2,1,1,1,5863,2,4
+6880,dtype_cast_488,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4
+6881,alias_default_1266,call_function,alias.default,backward,2,1,1,0,5865,0,3
+6882,alias_default_1209,call_function,alias.default,backward,2,1,1,2,5861,246,4
+6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5
+6884,permute_1143,call_function,permute.default,backward,2,1,1,1,4,242,3
+6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5
+6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10
+6887,permute_1144,call_function,permute.default,backward,2,1,1,1,5863,2,4
+6888,dtype_cast_489,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4
+6889,alias_default_1265,call_function,alias.default,backward,2,1,1,0,5865,0,3
+6890,convert_element_type_1850,call_function,convert_element_type.default,backward,2,1,1,1,5887,239,8
+6891,convert_element_type_1851,call_function,convert_element_type.default,backward,2,1,1,1,229,239,4
+6892,convert_element_type_1852,call_function,convert_element_type.default,backward,2,1,1,1,3,233,2
+6893,alias_default_1210,call_function,alias.default,backward,2,1,1,2,5888,238,4
+6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8
+6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8
+6896,alias_default_1211,call_function,alias.default,backward,2,1,1,2,5891,231,4
+6897,alias_default_1212,call_function,alias.default,backward,2,1,1,3,238,237,4
+6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8
+6899,sum_105,call_function,sum.dim_IntList,backward,2,1,1,1,5896,229,5
+6900,div_80,call_function,div.Tensor,backward,2,1,1,1,239,229,6
+6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8
+6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10
+6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8
+6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8
+6905,sum_106,call_function,sum.dim_IntList,backward,2,1,1,1,5893,3,5
+6906,convert_element_type_1853,call_function,convert_element_type.default,backward,2,1,1,1,5901,225,6
+6907,convert_element_type_1854,call_function,convert_element_type.default,backward,2,1,1,1,5894,2,3
+6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10
+6909,dtype_cast_490,call_function,dtype_cast.default,backward,2,1,1,1,5895,1,3
+6910,alias_default_1272,call_function,alias.default,backward,2,1,1,0,5896,0,2
+6911,alias_default_1213,call_function,alias.default,unknown,,1,1,3,5903,223,4
+6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5
+6913,permute_1147,call_function,permute.default,backward,1,1,1,1,4,219,3
+6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5
+6915,permute_1148,call_function,permute.default,backward,1,1,1,1,5905,2,4
+6916,dtype_cast_491,call_function,dtype_cast.default,backward,1,1,1,1,5906,1,4
+6917,alias_default_1261,call_function,alias.default,backward,1,1,1,0,5907,0,3
+6918,alias_default_1214,call_function,alias.default,backward,1,1,1,2,5906,217,4
+6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8
+6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8
+6921,alias_default_1215,call_function,alias.default,backward,1,1,1,2,5908,204,4
+6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5
+6923,permute_1151,call_function,permute.default,backward,1,1,1,1,4,200,3
+6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5
+6925,permute_1152,call_function,permute.default,backward,1,1,1,1,5910,2,4
+6926,dtype_cast_492,call_function,dtype_cast.default,backward,1,1,1,1,5911,1,4
+6927,alias_default_1262,call_function,alias.default,backward,1,1,1,0,5912,0,3
+6928,convert_element_type_1863,call_function,convert_element_type.default,backward,1,1,1,1,5908,208,6
+6929,convert_element_type_1864,call_function,convert_element_type.default,backward,1,1,1,1,206,218,4
+6930,alias_default_1216,call_function,alias.default,backward,1,1,1,2,207,217,4
+6931,neg_54,call_function,neg.default,backward,1,1,1,1,208,216,8
+6932,exp_54,call_function,exp.default,backward,1,1,1,1,209,215,6
+6933,add_323,call_function,add.Tensor,backward,1,1,1,1,210,214,4
+6934,reciprocal_26,call_function,reciprocal.default,backward,1,1,1,1,211,213,4
+6935,mul_726,call_function,mul.Tensor,backward,1,1,1,1,212,212,6
+6936,alias_default_1217,call_function,alias.default,backward,1,1,1,2,213,211,4
+6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8
+6938,sub_79,call_function,sub.Tensor,backward,1,1,1,1,214,209,4
+6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8
+6940,add_324,call_function,add.Tensor,backward,1,1,1,1,216,207,4
+6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8
+6942,convert_element_type_1865,call_function,convert_element_type.default,backward,1,1,1,1,5922,205,6
+6943,alias_default_1218,call_function,alias.default,backward,1,1,1,2,5923,204,4
+6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5
+6945,permute_1155,call_function,permute.default,backward,1,1,1,1,4,200,3
+6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5
+6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10
+6948,permute_1156,call_function,permute.default,backward,1,1,1,1,5925,2,4
+6949,dtype_cast_493,call_function,dtype_cast.default,backward,1,1,1,1,5926,1,4
+6950,alias_default_1260,call_function,alias.default,backward,1,1,1,0,5927,0,3
+6951,convert_element_type_1870,call_function,convert_element_type.default,backward,1,1,1,1,5931,197,8
+6952,convert_element_type_1871,call_function,convert_element_type.default,backward,1,1,1,1,186,197,4
+6953,convert_element_type_1872,call_function,convert_element_type.default,backward,1,1,1,1,3,191,2
+6954,alias_default_1219,call_function,alias.default,backward,1,1,1,2,5932,196,4
+6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8
+6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8
+6957,alias_default_1220,call_function,alias.default,backward,1,1,1,2,5935,189,4
+6958,alias_default_1221,call_function,alias.default,backward,1,1,1,3,195,195,4
+6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8
+6960,sum_107,call_function,sum.dim_IntList,backward,1,1,1,1,5940,187,5
+6961,div_81,call_function,div.Tensor,backward,1,1,1,1,196,187,6
+6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8
+6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10
+6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8
+6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8
+6966,sum_108,call_function,sum.dim_IntList,backward,1,1,1,1,5937,3,5
+6967,convert_element_type_1873,call_function,convert_element_type.default,backward,1,1,1,1,5945,183,6
+6968,convert_element_type_1874,call_function,convert_element_type.default,backward,1,1,1,1,5938,2,3
+6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10
+6970,dtype_cast_494,call_function,dtype_cast.default,backward,1,1,1,1,5939,1,3
+6971,alias_default_1264,call_function,alias.default,backward,1,1,1,0,5940,0,2
+6972,alias_default_1222,call_function,alias.default,unknown,,1,1,3,5947,181,4
+6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5
+6974,permute_1159,call_function,permute.default,backward,1,1,1,1,4,177,3
+6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5
+6976,permute_1160,call_function,permute.default,backward,1,1,1,1,5949,2,4
+6977,dtype_cast_495,call_function,dtype_cast.default,backward,1,1,1,1,5950,1,4
+6978,alias_default_1259,call_function,alias.default,backward,1,1,1,0,5951,0,3
+6979,view_1228,call_function,view.default,backward,1,1,1,1,5950,175,4
+6980,permute_1161,call_function,permute.default,backward,1,1,1,1,5951,174,4
+6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2
+6982,getitem_330,call_function,getitem,backward,1,1,1,1,5956,146,2
+6983,getitem_331,call_function,getitem,backward,1,1,1,1,5956,147,2
+6984,getitem_332,call_function,getitem,backward,1,1,1,1,5956,140,2
+6985,permute_1162,call_function,permute.default,backward,1,1,1,1,5957,139,2
+6986,permute_1163,call_function,permute.default,backward,1,1,1,1,5957,146,2
+6987,permute_1164,call_function,permute.default,backward,1,1,1,1,5957,145,2
+6988,convert_element_type_1879,call_function,convert_element_type.default,backward,1,1,1,1,5958,145,2
+6989,convert_element_type_1880,call_function,convert_element_type.default,backward,1,1,1,1,5958,144,2
+6990,view_1229,call_function,view.default,backward,1,1,1,1,5959,144,2
+6991,view_as_complex_108,call_function,view_as_complex.default,backward,1,1,1,1,5960,143,6
+6992,_conj_52,call_function,_conj.default,backward,1,1,1,1,4,144,3
+6993,clone_214,call_function,clone.default,backward,1,1,1,1,5,143,3
+6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8
+6995,view_1230,call_function,view.default,backward,1,1,1,1,5959,143,2
+6996,view_as_complex_109,call_function,view_as_complex.default,backward,1,1,1,1,5960,142,6
+6997,_conj_53,call_function,_conj.default,backward,1,1,1,1,4,143,3
+6998,clone_215,call_function,clone.default,backward,1,1,1,1,5,142,3
+6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8
+7000,view_as_real_108,call_function,view_as_real.default,backward,1,1,1,1,5964,141,6
+7001,view_1231,call_function,view.default,backward,1,1,1,1,5965,140,6
+7002,convert_element_type_1881,call_function,convert_element_type.default,backward,1,1,1,1,5966,139,6
+7003,view_as_real_109,call_function,view_as_real.default,backward,1,1,1,1,5964,140,6
+7004,view_1232,call_function,view.default,backward,1,1,1,1,5965,139,6
+7005,convert_element_type_1882,call_function,convert_element_type.default,backward,1,1,1,1,5966,138,6
+7006,view_1233,call_function,view.default,backward,1,1,1,1,5958,138,2
+7007,view_1234,call_function,view.default,backward,1,1,1,1,5967,138,5
+7008,view_1235,call_function,view.default,backward,1,1,1,1,5967,137,5
+7009,alias_default_1223,call_function,alias.default,backward,1,1,1,2,5959,137,4
+7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5
+7011,permute_1167,call_function,permute.default,backward,1,1,1,1,4,133,3
+7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5
+7013,permute_1168,call_function,permute.default,backward,1,1,1,1,5961,2,4
+7014,dtype_cast_496,call_function,dtype_cast.default,backward,1,1,1,1,5962,1,4
+7015,alias_default_1258,call_function,alias.default,backward,1,1,1,0,5963,0,3
+7016,alias_default_1224,call_function,alias.default,backward,1,1,1,2,5968,137,4
+7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5
+7018,permute_1171,call_function,permute.default,backward,1,1,1,1,4,133,3
+7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5
+7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10
+7021,permute_1172,call_function,permute.default,backward,1,1,1,1,5970,2,4
+7022,dtype_cast_497,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4
+7023,alias_default_1257,call_function,alias.default,backward,1,1,1,0,5972,0,3
+7024,alias_default_1225,call_function,alias.default,backward,1,1,1,2,5968,136,4
+7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5
+7026,permute_1175,call_function,permute.default,backward,1,1,1,1,4,132,3
+7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5
+7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10
+7029,permute_1176,call_function,permute.default,backward,1,1,1,1,5970,2,4
+7030,dtype_cast_498,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4
+7031,alias_default_1256,call_function,alias.default,backward,1,1,1,0,5972,0,3
+7032,convert_element_type_1895,call_function,convert_element_type.default,backward,1,1,1,1,5994,129,8
+7033,convert_element_type_1896,call_function,convert_element_type.default,backward,1,1,1,1,119,129,4
+7034,convert_element_type_1897,call_function,convert_element_type.default,backward,1,1,1,1,3,123,2
+7035,alias_default_1226,call_function,alias.default,backward,1,1,1,2,5995,128,4
+7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8
+7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8
+7038,alias_default_1227,call_function,alias.default,backward,1,1,1,2,5998,121,4
+7039,alias_default_1228,call_function,alias.default,backward,1,1,1,3,128,127,4
+7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8
+7041,sum_109,call_function,sum.dim_IntList,backward,1,1,1,1,6003,119,5
+7042,div_82,call_function,div.Tensor,backward,1,1,1,1,129,119,6
+7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8
+7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10
+7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8
+7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8
+7047,sum_110,call_function,sum.dim_IntList,backward,1,1,1,1,6000,3,5
+7048,convert_element_type_1898,call_function,convert_element_type.default,backward,1,1,1,1,6008,115,6
+7049,convert_element_type_1899,call_function,convert_element_type.default,backward,1,1,1,1,6001,2,3
+7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10
+7051,dtype_cast_499,call_function,dtype_cast.default,backward,1,1,1,1,6002,1,3
+7052,alias_default_1263,call_function,alias.default,backward,1,1,1,0,6003,0,2
+7053,alias_default_1229,call_function,alias.default,unknown,,1,1,3,6010,113,4
+7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5
+7055,permute_1179,call_function,permute.default,backward,0,1,1,1,4,109,3
+7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5
+7057,permute_1180,call_function,permute.default,backward,0,1,1,1,6012,2,4
+7058,dtype_cast_500,call_function,dtype_cast.default,backward,0,1,1,1,6013,1,4
+7059,alias_default_1252,call_function,alias.default,backward,0,1,1,0,6014,0,3
+7060,alias_default_1230,call_function,alias.default,backward,0,1,1,2,6013,107,4
+7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8
+7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8
+7063,alias_default_1231,call_function,alias.default,backward,0,1,1,2,6015,94,4
+7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5
+7065,permute_1183,call_function,permute.default,backward,0,1,1,1,4,90,3
+7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5
+7067,permute_1184,call_function,permute.default,backward,0,1,1,1,6017,2,4
+7068,dtype_cast_501,call_function,dtype_cast.default,backward,0,1,1,1,6018,1,4
+7069,alias_default_1253,call_function,alias.default,backward,0,1,1,0,6019,0,3
+7070,convert_element_type_1908,call_function,convert_element_type.default,backward,0,1,1,1,6015,98,6
+7071,convert_element_type_1909,call_function,convert_element_type.default,backward,0,1,1,1,96,108,4
+7072,alias_default_1232,call_function,alias.default,backward,0,1,1,2,97,107,4
+7073,neg_55,call_function,neg.default,backward,0,1,1,1,98,106,8
+7074,exp_55,call_function,exp.default,backward,0,1,1,1,99,105,6
+7075,add_330,call_function,add.Tensor,backward,0,1,1,1,100,104,4
+7076,reciprocal_27,call_function,reciprocal.default,backward,0,1,1,1,101,103,4
+7077,mul_746,call_function,mul.Tensor,backward,0,1,1,1,102,102,6
+7078,alias_default_1233,call_function,alias.default,backward,0,1,1,2,103,101,4
+7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8
+7080,sub_82,call_function,sub.Tensor,backward,0,1,1,1,104,99,4
+7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8
+7082,add_331,call_function,add.Tensor,backward,0,1,1,1,106,97,4
+7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8
+7084,convert_element_type_1910,call_function,convert_element_type.default,backward,0,1,1,1,6029,95,6
+7085,alias_default_1234,call_function,alias.default,backward,0,1,1,2,6030,94,4
+7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5
+7087,permute_1187,call_function,permute.default,backward,0,1,1,1,4,90,3
+7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5
+7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10
+7090,permute_1188,call_function,permute.default,backward,0,1,1,1,6032,2,4
+7091,dtype_cast_502,call_function,dtype_cast.default,backward,0,1,1,1,6033,1,4
+7092,alias_default_1251,call_function,alias.default,backward,0,1,1,0,6034,0,3
+7093,convert_element_type_1915,call_function,convert_element_type.default,backward,0,1,1,1,6038,87,8
+7094,convert_element_type_1916,call_function,convert_element_type.default,backward,0,1,1,1,76,87,4
+7095,convert_element_type_1917,call_function,convert_element_type.default,backward,0,1,1,1,3,81,2
+7096,alias_default_1235,call_function,alias.default,backward,0,1,1,2,6039,86,4
+7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8
+7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8
+7099,alias_default_1236,call_function,alias.default,backward,0,1,1,2,6042,79,4
+7100,alias_default_1237,call_function,alias.default,backward,0,1,1,3,85,85,4
+7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8
+7102,sum_111,call_function,sum.dim_IntList,backward,0,1,1,1,6047,77,5
+7103,div_83,call_function,div.Tensor,backward,0,1,1,1,86,77,6
+7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8
+7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10
+7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8
+7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8
+7108,sum_112,call_function,sum.dim_IntList,backward,0,1,1,1,6044,3,5
+7109,convert_element_type_1918,call_function,convert_element_type.default,backward,0,1,1,1,6052,73,6
+7110,convert_element_type_1919,call_function,convert_element_type.default,backward,0,1,1,1,6045,2,3
+7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10
+7112,dtype_cast_503,call_function,dtype_cast.default,backward,0,1,1,1,6046,1,3
+7113,alias_default_1255,call_function,alias.default,backward,0,1,1,0,6047,0,2
+7114,alias_default_1238,call_function,alias.default,unknown,,1,1,3,6054,71,4
+7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5
+7116,permute_1191,call_function,permute.default,backward,0,1,1,1,4,67,3
+7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5
+7118,permute_1192,call_function,permute.default,backward,0,1,1,1,6056,2,4
+7119,dtype_cast_504,call_function,dtype_cast.default,backward,0,1,1,1,6057,1,4
+7120,alias_default_1250,call_function,alias.default,backward,0,1,1,0,6058,0,3
+7121,view_1250,call_function,view.default,backward,0,1,1,1,6057,65,4
+7122,permute_1193,call_function,permute.default,backward,0,1,1,1,6058,64,4
+7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2
+7124,getitem_333,call_function,getitem,backward,0,1,1,1,6063,36,2
+7125,getitem_334,call_function,getitem,backward,0,1,1,1,6063,37,2
+7126,getitem_335,call_function,getitem,backward,0,1,1,1,6063,30,2
+7127,permute_1194,call_function,permute.default,backward,0,1,1,1,6064,29,2
+7128,permute_1195,call_function,permute.default,backward,0,1,1,1,6064,36,2
+7129,permute_1196,call_function,permute.default,backward,0,1,1,1,6064,35,2
+7130,convert_element_type_1924,call_function,convert_element_type.default,backward,0,1,1,1,6065,35,2
+7131,convert_element_type_1925,call_function,convert_element_type.default,backward,0,1,1,1,6065,34,2
+7132,view_1251,call_function,view.default,backward,0,1,1,1,6066,34,2
+7133,view_as_complex_110,call_function,view_as_complex.default,backward,0,1,1,1,6067,33,6
+7134,_conj_54,call_function,_conj.default,backward,0,1,1,1,4,34,3
+7135,clone_222,call_function,clone.default,backward,0,1,1,1,5,33,3
+7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8
+7137,view_1252,call_function,view.default,backward,0,1,1,1,6066,33,2
+7138,view_as_complex_111,call_function,view_as_complex.default,backward,0,1,1,1,6067,32,6
+7139,_conj_55,call_function,_conj.default,backward,0,1,1,1,4,33,3
+7140,clone_223,call_function,clone.default,backward,0,1,1,1,5,32,3
+7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8
+7142,view_as_real_110,call_function,view_as_real.default,backward,0,1,1,1,6071,31,6
+7143,view_1253,call_function,view.default,backward,0,1,1,1,6072,30,6
+7144,convert_element_type_1926,call_function,convert_element_type.default,backward,0,1,1,1,6073,29,6
+7145,view_as_real_111,call_function,view_as_real.default,backward,0,1,1,1,6071,30,6
+7146,view_1254,call_function,view.default,backward,0,1,1,1,6072,29,6
+7147,convert_element_type_1927,call_function,convert_element_type.default,backward,0,1,1,1,6073,28,6
+7148,view_1255,call_function,view.default,backward,0,1,1,1,6065,28,2
+7149,view_1256,call_function,view.default,backward,0,1,1,1,6074,28,5
+7150,view_1257,call_function,view.default,backward,0,1,1,1,6074,27,5
+7151,alias_default_1239,call_function,alias.default,backward,0,1,1,2,6066,27,4
+7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5
+7153,permute_1199,call_function,permute.default,backward,0,1,1,1,4,23,3
+7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5
+7155,permute_1200,call_function,permute.default,backward,0,1,1,1,6068,2,4
+7156,dtype_cast_505,call_function,dtype_cast.default,backward,0,1,1,1,6069,1,4
+7157,alias_default_1249,call_function,alias.default,backward,0,1,1,0,6070,0,3
+7158,alias_default_1240,call_function,alias.default,backward,0,1,1,2,6075,27,4
+7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5
+7160,permute_1203,call_function,permute.default,backward,0,1,1,1,4,23,3
+7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5
+7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10
+7163,permute_1204,call_function,permute.default,backward,0,1,1,1,6077,2,4
+7164,dtype_cast_506,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4
+7165,alias_default_1248,call_function,alias.default,backward,0,1,1,0,6079,0,3
+7166,alias_default_1241,call_function,alias.default,backward,0,1,1,2,6075,26,4
+7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5
+7168,permute_1207,call_function,permute.default,backward,0,1,1,1,4,22,3
+7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5
+7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10
+7171,permute_1208,call_function,permute.default,backward,0,1,1,1,6077,2,4
+7172,dtype_cast_507,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4
+7173,alias_default_1247,call_function,alias.default,backward,0,1,1,0,6079,0,3
+7174,convert_element_type_1940,call_function,convert_element_type.default,backward,0,1,1,1,6101,19,8
+7175,convert_element_type_1941,call_function,convert_element_type.default,backward,0,1,1,1,7,19,4
+7176,convert_element_type_1942,call_function,convert_element_type.default,backward,0,1,1,1,3,13,2
+7177,alias_default_1242,call_function,alias.default,backward,0,1,1,2,6102,18,4
+7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8
+7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8
+7180,alias_default_1243,call_function,alias.default,backward,0,1,1,2,6105,11,4
+7181,alias_default_1244,call_function,alias.default,backward,0,1,1,3,16,17,4
+7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8
+7183,sum_113,call_function,sum.dim_IntList,backward,0,1,1,1,6110,9,5
+7184,div_84,call_function,div.Tensor,backward,0,1,1,1,17,9,6
+7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8
+7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10
+7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8
+7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8
+7189,sum_114,call_function,sum.dim_IntList,backward,0,1,1,1,6107,3,5
+7190,convert_element_type_1943,call_function,convert_element_type.default,backward,0,1,1,1,6115,5,6
+7191,convert_element_type_1944,call_function,convert_element_type.default,backward,0,1,1,1,6108,2,3
+7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10
+7193,dtype_cast_508,call_function,dtype_cast.default,backward,0,1,1,1,6109,1,3
+7194,alias_default_1254,call_function,alias.default,backward,0,1,1,0,6110,0,2
+7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5
+7196,dtype_cast_509,call_function,dtype_cast.default,backward,,1,1,1,6118,2,3
+7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9
+7198,alias_default_1246,call_function,alias.default,unknown,,1,1,0,6127,0,3
diff --git a/profile_results/real_llama3_3b_dag_summary.json b/profile_results/real_llama3_3b_dag_summary.json
new file mode 100644
index 00000000..93434ea9
--- /dev/null
+++ b/profile_results/real_llama3_3b_dag_summary.json
@@ -0,0 +1,883 @@
+{
+  "branch_points": 1301,
+  "dag_edges": 8805,
+  "direct_dependency_histogram": {
+    "0": 257,
+    "1": 5275,
+    "2": 1611,
+    "3": 28,
+    "8": 28
+  },
+  "direct_offspring_histogram": {
+    "0": 255,
+    "1": 5643,
+    "2": 934,
+    "3": 254,
+    "4": 84,
+    "6": 28,
+    "28": 1
+  },
+  "ilp_nodes": 7199,
+  "max_ancestor_count": 6127,
+  "max_descendant_count": 5943,
+  "max_direct_dependency_nodes": 8,
+  "max_direct_offspring_nodes": 28,
+  "merge_points": 1667,
+  "merge_points_csv": "profile_results/real_llama3_3b_merge_points.csv",
+  "mesh": "1D 64",
+  "model": "LLaMA3 3B",
+  "node_stats_csv": "profile_results/real_llama3_3b_dag_node_stats.csv",
+  "top_fanout_points": [
+    {
+      "ancestor_count": 1,
+      "descendant_count": 5942,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 28,
+      "idx": 296,
+      "layer": "",
+      "name": "alias_default_1",
+      "op": "call_function",
+      "phase": "unknown",
+      "strategy_count": 3,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 20,
+      "descendant_count": 5788,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 276,
+      "layer": 0,
+      "name": "alias_default_8",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 132,
+      "descendant_count": 5692,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 381,
+      "layer": 1,
+      "name": "alias_default_36",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 242,
+      "descendant_count": 5596,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 485,
+      "layer": 2,
+      "name": "alias_default_64",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 352,
+      "descendant_count": 5500,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 589,
+      "layer": 3,
+      "name": "alias_default_92",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 462,
+      "descendant_count": 5404,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 693,
+      "layer": 4,
+      "name": "alias_default_120",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 572,
+      "descendant_count": 5308,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 797,
+      "layer": 5,
+      "name": "alias_default_148",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 682,
+      "descendant_count": 5212,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 901,
+      "layer": 6,
+      "name": "alias_default_176",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 792,
+      "descendant_count": 5116,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1005,
+      "layer": 7,
+      "name": "alias_default_204",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 902,
+      "descendant_count": 5020,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1109,
+      "layer": 8,
+      "name": "alias_default_232",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1012,
+      "descendant_count": 4924,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1213,
+      "layer": 9,
+      "name": "alias_default_260",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1122,
+      "descendant_count": 4828,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1317,
+      "layer": 10,
+      "name": "alias_default_288",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1232,
+      "descendant_count": 4732,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1421,
+      "layer": 11,
+      "name": "alias_default_316",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1342,
+      "descendant_count": 4636,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1525,
+      "layer": 12,
+      "name": "alias_default_344",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1452,
+      "descendant_count": 4540,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1629,
+      "layer": 13,
+      "name": "alias_default_372",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1562,
+      "descendant_count": 4444,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1733,
+      "layer": 14,
+      "name": "alias_default_400",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1672,
+      "descendant_count": 4348,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1837,
+      "layer": 15,
+      "name": "alias_default_428",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1782,
+      "descendant_count": 4252,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 1941,
+      "layer": 16,
+      "name": "alias_default_456",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 1892,
+      "descendant_count": 4156,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2045,
+      "layer": 17,
+      "name": "alias_default_484",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2002,
+      "descendant_count": 4060,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2149,
+      "layer": 18,
+      "name": "alias_default_512",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2112,
+      "descendant_count": 3964,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2253,
+      "layer": 19,
+      "name": "alias_default_540",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2222,
+      "descendant_count": 3868,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2357,
+      "layer": 20,
+      "name": "alias_default_568",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2332,
+      "descendant_count": 3772,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2461,
+      "layer": 21,
+      "name": "alias_default_596",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2442,
+      "descendant_count": 3676,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2565,
+      "layer": 22,
+      "name": "alias_default_624",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2552,
+      "descendant_count": 3580,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2669,
+      "layer": 23,
+      "name": "alias_default_652",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2662,
+      "descendant_count": 3484,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2773,
+      "layer": 24,
+      "name": "alias_default_680",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2772,
+      "descendant_count": 3388,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2877,
+      "layer": 25,
+      "name": "alias_default_708",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2882,
+      "descendant_count": 3292,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 2981,
+      "layer": 26,
+      "name": "alias_default_736",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 2992,
+      "descendant_count": 3196,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 6,
+      "idx": 3085,
+      "layer": 27,
+      "name": "alias_default_764",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 4,
+      "target": "alias.default"
+    },
+    {
+      "ancestor_count": 3,
+      "descendant_count": 5778,
+      "direct_dependency_args": 1,
+      "direct_dependency_nodes": 1,
+      "direct_offspring_nodes": 4,
+      "idx": 298,
+      "layer": 0,
+      "name": "alias_default_12",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 3,
+      "target": "alias.default"
+    }
+  ],
+  "top_merge_points": [
+    {
+      "ancestor_count": 3173,
+      "descendant_count": 3033,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 3289,
+      "layer": 27,
+      "name": "_scaled_dot_product_flash_attention_backward",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3280,
+      "descendant_count": 2923,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 3431,
+      "layer": 26,
+      "name": "_scaled_dot_product_flash_attention_backward_1",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3387,
+      "descendant_count": 2813,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 3573,
+      "layer": 25,
+      "name": "_scaled_dot_product_flash_attention_backward_2",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3494,
+      "descendant_count": 2703,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 3715,
+      "layer": 24,
+      "name": "_scaled_dot_product_flash_attention_backward_3",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3601,
+      "descendant_count": 2593,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 3857,
+      "layer": 23,
+      "name": "_scaled_dot_product_flash_attention_backward_4",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3708,
+      "descendant_count": 2483,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 3999,
+      "layer": 22,
+      "name": "_scaled_dot_product_flash_attention_backward_5",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3815,
+      "descendant_count": 2373,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4141,
+      "layer": 21,
+      "name": "_scaled_dot_product_flash_attention_backward_6",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 3922,
+      "descendant_count": 2263,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4283,
+      "layer": 20,
+      "name": "_scaled_dot_product_flash_attention_backward_7",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4029,
+      "descendant_count": 2153,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4425,
+      "layer": 19,
+      "name": "_scaled_dot_product_flash_attention_backward_8",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4136,
+      "descendant_count": 2043,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4567,
+      "layer": 18,
+      "name": "_scaled_dot_product_flash_attention_backward_9",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4243,
+      "descendant_count": 1933,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4709,
+      "layer": 17,
+      "name": "_scaled_dot_product_flash_attention_backward_10",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4350,
+      "descendant_count": 1823,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4851,
+      "layer": 16,
+      "name": "_scaled_dot_product_flash_attention_backward_11",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4457,
+      "descendant_count": 1713,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 4993,
+      "layer": 15,
+      "name": "_scaled_dot_product_flash_attention_backward_12",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4564,
+      "descendant_count": 1603,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5135,
+      "layer": 14,
+      "name": "_scaled_dot_product_flash_attention_backward_13",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4671,
+      "descendant_count": 1493,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5277,
+      "layer": 13,
+      "name": "_scaled_dot_product_flash_attention_backward_14",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4778,
+      "descendant_count": 1383,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5419,
+      "layer": 12,
+      "name": "_scaled_dot_product_flash_attention_backward_15",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4885,
+      "descendant_count": 1273,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5561,
+      "layer": 11,
+      "name": "_scaled_dot_product_flash_attention_backward_16",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 4992,
+      "descendant_count": 1163,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5703,
+      "layer": 10,
+      "name": "_scaled_dot_product_flash_attention_backward_17",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5099,
+      "descendant_count": 1053,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5845,
+      "layer": 9,
+      "name": "_scaled_dot_product_flash_attention_backward_18",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5206,
+      "descendant_count": 943,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 5987,
+      "layer": 8,
+      "name": "_scaled_dot_product_flash_attention_backward_19",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5313,
+      "descendant_count": 833,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6129,
+      "layer": 7,
+      "name": "_scaled_dot_product_flash_attention_backward_20",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5420,
+      "descendant_count": 723,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6271,
+      "layer": 6,
+      "name": "_scaled_dot_product_flash_attention_backward_21",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5527,
+      "descendant_count": 613,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6413,
+      "layer": 5,
+      "name": "_scaled_dot_product_flash_attention_backward_22",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5634,
+      "descendant_count": 503,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6555,
+      "layer": 4,
+      "name": "_scaled_dot_product_flash_attention_backward_23",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5741,
+      "descendant_count": 393,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6697,
+      "layer": 3,
+      "name": "_scaled_dot_product_flash_attention_backward_24",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5848,
+      "descendant_count": 283,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6839,
+      "layer": 2,
+      "name": "_scaled_dot_product_flash_attention_backward_25",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 5955,
+      "descendant_count": 173,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 6981,
+      "layer": 1,
+      "name": "_scaled_dot_product_flash_attention_backward_26",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 6062,
+      "descendant_count": 63,
+      "direct_dependency_args": 8,
+      "direct_dependency_nodes": 8,
+      "direct_offspring_nodes": 3,
+      "idx": 7123,
+      "layer": 0,
+      "name": "_scaled_dot_product_flash_attention_backward_27",
+      "op": "call_function",
+      "phase": "backward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention_backward.default"
+    },
+    {
+      "ancestor_count": 63,
+      "descendant_count": 5761,
+      "direct_dependency_args": 3,
+      "direct_dependency_nodes": 3,
+      "direct_offspring_nodes": 4,
+      "idx": 313,
+      "layer": 0,
+      "name": "_scaled_dot_product_flash_attention",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention.default"
+    },
+    {
+      "ancestor_count": 173,
+      "descendant_count": 5665,
+      "direct_dependency_args": 3,
+      "direct_dependency_nodes": 3,
+      "direct_offspring_nodes": 4,
+      "idx": 417,
+      "layer": 1,
+      "name": "_scaled_dot_product_flash_attention_1",
+      "op": "call_function",
+      "phase": "forward",
+      "strategy_count": 2,
+      "target": "_scaled_dot_product_flash_attention.default"
+    }
+  ],
+  "trace_and_optimizer_build_s": 38.44014171184972,
+  "treewidth_upper_bounds": {
+    "moralized_edges": 11200,
+    "moralized_min_degree": 10,
+    "moralized_min_fill": 8,
+    "undirected_edges": 8805,
+    "undirected_min_degree": 9,
+    "undirected_min_fill": 6
+  }
+}
\ No newline at end of file
diff --git a/profile_results/real_llama3_3b_merge_points.csv b/profile_results/real_llama3_3b_merge_points.csv
new file mode 100644
index 00000000..4418765e
--- /dev/null
+++ b/profile_results/real_llama3_3b_merge_points.csv
@@ -0,0 +1,1668 @@
+idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count
+3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2
+3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2
+3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2
+3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2
+3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2
+3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2
+4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2
+4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2
+4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2
+4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2
+4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2
+4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2
+4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2
+5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2
+5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2
+5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2
+5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2
+5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2
+5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2
+5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2
+6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2
+6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2
+6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2
+6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2
+6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2
+6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2
+6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2
+7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2
+313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2
+417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2
+521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2
+625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2
+729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2
+833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2
+937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2
+1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2
+1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2
+1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2
+1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2
+1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2
+1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2
+1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2
+1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2
+1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2
+1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2
+2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2
+2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2
+2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2
+2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2
+2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2
+2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2
+2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2
+2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2
+2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2
+3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2
+3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2
+260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5
+270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8
+272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8
+278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5
+282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5
+299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
+302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
+286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5
+325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5
+326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10
+336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8
+338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8
+344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5
+351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6
+356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5
+359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8
+364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5
+365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10
+375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8
+377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8
+383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5
+387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5
+403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
+406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
+391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5
+429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5
+430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10
+440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8
+442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8
+448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5
+455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6
+460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5
+463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8
+468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5
+469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10
+479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8
+481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8
+487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5
+491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5
+507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
+510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
+495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5
+533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5
+534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10
+544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8
+546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8
+552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5
+559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6
+564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5
+567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8
+572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5
+573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10
+583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8
+585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8
+591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5
+595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5
+611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
+614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
+599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5
+637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5
+638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10
+648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8
+650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8
+656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5
+663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6
+668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5
+671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8
+676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5
+677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10
+687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8
+689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8
+695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5
+699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5
+715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
+718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
+703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5
+741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5
+742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10
+752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8
+754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8
+760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5
+767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6
+772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5
+775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8
+780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5
+781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10
+791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8
+793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8
+799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5
+803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5
+819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
+822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
+807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5
+845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5
+846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10
+856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8
+858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8
+864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5
+871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6
+876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5
+879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8
+884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5
+885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10
+895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8
+897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8
+903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5
+907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5
+923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
+926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
+911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5
+949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5
+950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10
+960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8
+962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8
+968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5
+975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6
+980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5
+983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8
+988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5
+989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10
+999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8
+1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8
+1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5
+1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5
+1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
+1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
+1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5
+1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5
+1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10
+1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8
+1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8
+1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5
+1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6
+1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5
+1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8
+1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5
+1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10
+1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8
+1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8
+1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5
+1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5
+1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
+1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
+1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5
+1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5
+1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10
+1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8
+1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8
+1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5
+1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6
+1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5
+1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8
+1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5
+1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10
+1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8
+1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8
+1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
+1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
+1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
+1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
+1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5
+1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5
+1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10
+1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8
+1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8
+1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5
+1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6
+1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5
+1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8
+1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5
+1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10
+1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8
+1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8
+1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
+1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
+1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
+1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
+1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5
+1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5
+1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10
+1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8
+1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8
+1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5
+1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6
+1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5
+1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8
+1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5
+1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10
+1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8
+1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8
+1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
+1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
+1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
+1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
+1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5
+1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5
+1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10
+1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8
+1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8
+1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5
+1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6
+1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5
+1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8
+1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5
+1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10
+1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8
+1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8
+1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
+1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
+1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
+1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
+1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5
+1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5
+1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10
+1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8
+1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8
+1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5
+1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6
+1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5
+1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8
+1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5
+1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10
+1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8
+1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8
+1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
+1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
+1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
+1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
+1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5
+1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5
+1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10
+1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8
+1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8
+1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5
+1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6
+1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5
+1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8
+1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5
+1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10
+1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8
+1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8
+1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
+1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
+1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
+1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
+1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5
+1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5
+1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10
+1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8
+1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8
+1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5
+1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6
+1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5
+1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8
+1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5
+1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10
+1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8
+1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8
+1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
+1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
+1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
+1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
+1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5
+1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5
+1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10
+1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8
+1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8
+1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5
+1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6
+1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5
+1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8
+1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5
+1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10
+1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8
+1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8
+1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
+1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
+1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
+1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
+1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5
+1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5
+1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10
+2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8
+2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8
+2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5
+2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6
+2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5
+2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8
+2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5
+2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10
+2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8
+2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8
+2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
+2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
+2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
+2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
+2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5
+2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5
+2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10
+2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8
+2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8
+2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5
+2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6
+2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5
+2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8
+2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5
+2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10
+2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8
+2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8
+2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
+2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
+2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
+2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
+2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5
+2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5
+2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10
+2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8
+2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8
+2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5
+2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6
+2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5
+2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8
+2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5
+2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10
+2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8
+2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8
+2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
+2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
+2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
+2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
+2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5
+2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5
+2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10
+2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8
+2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8
+2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5
+2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6
+2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5
+2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8
+2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5
+2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10
+2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8
+2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8
+2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
+2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
+2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
+2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
+2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5
+2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5
+2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10
+2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8
+2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8
+2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5
+2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6
+2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5
+2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8
+2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5
+2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10
+2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8
+2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8
+2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
+2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
+2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
+2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
+2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5
+2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5
+2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10
+2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8
+2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8
+2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5
+2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6
+2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5
+2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8
+2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5
+2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10
+2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8
+2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8
+2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
+2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
+2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
+2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
+2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5
+2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5
+2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10
+2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8
+2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8
+2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5
+2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6
+2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5
+2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8
+2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5
+2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10
+2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8
+2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8
+2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
+2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
+2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
+2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
+2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5
+2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5
+2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10
+2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8
+2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8
+2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5
+2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6
+2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5
+2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8
+2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5
+2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10
+2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8
+2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8
+2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
+2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
+2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
+2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
+2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5
+2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5
+2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10
+2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8
+2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8
+2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5
+2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6
+2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5
+2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8
+2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5
+2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10
+2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8
+2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8
+2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
+2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
+2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
+2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
+2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5
+2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5
+2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10
+2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8
+2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8
+2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5
+2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6
+2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5
+2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8
+2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5
+2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10
+2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8
+2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8
+2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
+2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
+3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
+3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
+2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5
+3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5
+3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10
+3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8
+3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8
+3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5
+3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6
+3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5
+3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8
+3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5
+3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10
+3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8
+3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8
+3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
+3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
+3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
+3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
+3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5
+3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5
+3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10
+3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8
+3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8
+3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5
+3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6
+3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5
+3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8
+3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5
+3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10
+3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5
+3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8
+3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8
+3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8
+3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8
+3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10
+3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8
+3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5
+3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8
+3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8
+3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8
+3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8
+3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8
+3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5
+3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5
+3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10
+3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8
+3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8
+3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8
+3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8
+3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10
+3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8
+3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10
+3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5
+3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8
+3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8
+3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5
+3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5
+3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10
+3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5
+3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10
+3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8
+3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8
+3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8
+3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8
+3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10
+3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8
+3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10
+3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5
+3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8
+3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8
+3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8
+3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8
+3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8
+3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5
+3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5
+3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10
+3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8
+3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8
+3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8
+3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8
+3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10
+3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8
+3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10
+3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5
+3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8
+3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8
+3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5
+3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5
+3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10
+3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5
+3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10
+3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8
+3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8
+3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8
+3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8
+3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10
+3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8
+3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10
+3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5
+3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8
+3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8
+3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8
+3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8
+3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8
+3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5
+3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5
+3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10
+3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8
+3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8
+3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8
+3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8
+3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10
+3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8
+3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10
+3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5
+3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8
+3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8
+3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5
+3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5
+3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10
+3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5
+3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10
+3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8
+3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8
+3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8
+3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8
+3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10
+3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8
+3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10
+3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5
+3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8
+3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8
+3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8
+3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8
+3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8
+3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5
+3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5
+3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10
+3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8
+3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8
+3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8
+3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8
+3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10
+3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8
+3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10
+3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5
+3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8
+3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8
+3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5
+3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5
+3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10
+3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5
+3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10
+3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8
+3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8
+3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8
+3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8
+3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10
+3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8
+3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10
+3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5
+3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8
+3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8
+3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8
+3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8
+3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8
+3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5
+3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5
+3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10
+3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8
+3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8
+3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8
+3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8
+3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10
+3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8
+3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10
+3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5
+3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8
+3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8
+3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5
+3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5
+3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10
+3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5
+3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10
+3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8
+3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8
+3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8
+3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8
+3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10
+3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8
+3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10
+3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5
+3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8
+3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8
+3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8
+3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8
+3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8
+3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5
+3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5
+3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10
+3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8
+3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8
+3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8
+3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8
+3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10
+3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8
+3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10
+3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5
+4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8
+4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8
+4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5
+4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5
+4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10
+4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5
+4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10
+4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8
+4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8
+4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8
+4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8
+4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10
+4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8
+4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10
+4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5
+4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8
+4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8
+4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8
+4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8
+4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8
+4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5
+4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5
+4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10
+4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8
+4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8
+4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8
+4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8
+4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10
+4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8
+4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10
+4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5
+4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8
+4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8
+4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5
+4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5
+4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10
+4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5
+4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10
+4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8
+4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8
+4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8
+4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8
+4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10
+4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8
+4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10
+4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5
+4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8
+4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8
+4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8
+4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8
+4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8
+4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5
+4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5
+4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10
+4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8
+4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8
+4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8
+4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8
+4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10
+4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8
+4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10
+4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5
+4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8
+4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8
+4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5
+4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5
+4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10
+4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5
+4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10
+4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8
+4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8
+4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8
+4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8
+4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10
+4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8
+4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10
+4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5
+4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8
+4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8
+4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8
+4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8
+4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8
+4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5
+4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5
+4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10
+4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8
+4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8
+4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8
+4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8
+4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10
+4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8
+4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10
+4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5
+4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8
+4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8
+4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5
+4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5
+4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10
+4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5
+4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10
+4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8
+4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8
+4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8
+4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8
+4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10
+4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8
+4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10
+4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5
+4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8
+4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8
+4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8
+4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8
+4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8
+4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5
+4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5
+4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10
+4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8
+4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8
+4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8
+4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8
+4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10
+4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8
+4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10
+4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5
+4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8
+4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8
+4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5
+4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5
+4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10
+4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5
+4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10
+4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8
+4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8
+4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8
+4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8
+4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10
+4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8
+4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10
+4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5
+4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8
+4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8
+4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8
+4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8
+4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8
+4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5
+4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5
+4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10
+4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8
+4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8
+4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8
+4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8
+4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10
+4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8
+4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10
+4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5
+4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8
+4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8
+4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5
+4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5
+4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10
+4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5
+4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10
+4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8
+4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8
+4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8
+4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8
+4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10
+4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8
+4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10
+4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5
+4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8
+4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8
+4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8
+4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8
+4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8
+4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5
+4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5
+4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10
+4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8
+4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8
+4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8
+4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8
+4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10
+4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8
+4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10
+4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5
+4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8
+4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8
+4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5
+4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5
+4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10
+4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5
+4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10
+4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8
+4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8
+4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8
+4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8
+4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10
+4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8
+4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10
+4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5
+4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8
+4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8
+4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8
+4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8
+4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8
+4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5
+4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5
+4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10
+4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8
+4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8
+4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8
+4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8
+4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10
+4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8
+4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10
+4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5
+5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8
+5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8
+5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5
+5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5
+5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10
+5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5
+5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10
+5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8
+5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8
+5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8
+5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8
+5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10
+5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8
+5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10
+5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5
+5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8
+5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8
+5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8
+5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8
+5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8
+5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5
+5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5
+5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10
+5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8
+5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8
+5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8
+5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8
+5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10
+5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8
+5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10
+5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5
+5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8
+5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8
+5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5
+5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5
+5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10
+5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5
+5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10
+5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8
+5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8
+5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8
+5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8
+5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10
+5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8
+5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10
+5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5
+5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8
+5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8
+5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8
+5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8
+5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8
+5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5
+5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5
+5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10
+5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8
+5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8
+5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8
+5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8
+5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10
+5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8
+5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10
+5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5
+5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8
+5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8
+5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5
+5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5
+5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10
+5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5
+5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10
+5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8
+5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8
+5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8
+5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8
+5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10
+5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8
+5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10
+5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5
+5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8
+5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8
+5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8
+5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8
+5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8
+5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5
+5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5
+5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10
+5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8
+5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8
+5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8
+5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8
+5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10
+5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8
+5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10
+5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5
+5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8
+5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8
+5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5
+5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5
+5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10
+5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5
+5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10
+5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8
+5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8
+5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8
+5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8
+5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10
+5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8
+5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10
+5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5
+5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8
+5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8
+5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8
+5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8
+5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8
+5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5
+5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5
+5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10
+5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8
+5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8
+5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8
+5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8
+5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10
+5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8
+5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10
+5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5
+5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8
+5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8
+5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5
+5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5
+5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10
+5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5
+5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10
+5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8
+5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8
+5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8
+5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8
+5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10
+5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8
+5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10
+5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5
+5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8
+5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8
+5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8
+5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8
+5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8
+5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5
+5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5
+5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10
+5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8
+5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8
+5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8
+5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8
+5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10
+5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8
+5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10
+5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5
+5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8
+5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8
+5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5
+5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5
+5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10
+5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5
+5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10
+5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8
+5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8
+5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8
+5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8
+5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10
+5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8
+5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10
+5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5
+5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8
+5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8
+5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8
+5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8
+5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8
+5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5
+5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5
+5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10
+5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8
+5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8
+5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8
+5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8
+5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10
+5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8
+5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10
+5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5
+5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8
+5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8
+5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5
+5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5
+5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10
+5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5
+5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10
+5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8
+5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8
+5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8
+5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8
+5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10
+5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8
+5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10
+5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5
+5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8
+5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8
+5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8
+5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8
+5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8
+5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5
+5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5
+5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10
+5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8
+5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8
+5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8
+5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8
+5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10
+5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8
+5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10
+5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5
+6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8
+6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8
+6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5
+6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5
+6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10
+6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5
+6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10
+6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8
+6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8
+6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8
+6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8
+6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10
+6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8
+6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10
+6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5
+6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8
+6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8
+6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8
+6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8
+6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8
+6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5
+6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5
+6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10
+6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8
+6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8
+6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8
+6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8
+6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10
+6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8
+6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10
+6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5
+6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8
+6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8
+6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5
+6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5
+6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10
+6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5
+6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10
+6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8
+6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8
+6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8
+6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8
+6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10
+6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8
+6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10
+6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5
+6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8
+6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8
+6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8
+6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8
+6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8
+6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5
+6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5
+6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10
+6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8
+6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8
+6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8
+6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8
+6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10
+6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8
+6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10
+6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5
+6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8
+6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8
+6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5
+6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5
+6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10
+6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5
+6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10
+6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8
+6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8
+6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8
+6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8
+6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10
+6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8
+6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10
+6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5
+6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8
+6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8
+6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8
+6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8
+6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8
+6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5
+6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5
+6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10
+6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8
+6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8
+6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8
+6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8
+6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10
+6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8
+6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10
+6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5
+6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8
+6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8
+6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5
+6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5
+6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10
+6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5
+6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10
+6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8
+6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8
+6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8
+6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8
+6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10
+6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8
+6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10
+6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5
+6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8
+6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8
+6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8
+6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8
+6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8
+6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5
+6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5
+6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10
+6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8
+6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8
+6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8
+6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8
+6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10
+6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8
+6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10
+6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5
+6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8
+6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8
+6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5
+6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5
+6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10
+6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5
+6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10
+6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8
+6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8
+6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8
+6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8
+6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10
+6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8
+6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10
+6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5
+6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8
+6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8
+6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8
+6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8
+6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8
+6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5
+6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5
+6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10
+6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8
+6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8
+6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8
+6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8
+6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10
+6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8
+6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10
+6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5
+6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8
+6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8
+6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5
+6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5
+6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10
+6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5
+6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10
+6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8
+6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8
+6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8
+6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8
+6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10
+6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8
+6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10
+6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5
+6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8
+6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8
+6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8
+6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8
+6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8
+6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5
+6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5
+6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10
+6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8
+6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8
+6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8
+6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8
+6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10
+6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8
+6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10
+6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5
+6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8
+6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8
+6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5
+6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5
+6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10
+6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5
+6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10
+6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8
+6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8
+6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8
+6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8
+6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10
+6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8
+6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10
+6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5
+6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8
+6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8
+6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8
+6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8
+6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8
+6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5
+6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5
+6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10
+6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8
+6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8
+6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8
+6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8
+6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10
+6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8
+6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10
+6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5
+6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8
+6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8
+7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5
+7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5
+7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10
+7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5
+7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10
+7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8
+7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8
+7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8
+7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8
+7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10
+7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8
+7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10
+7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5
+7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8
+7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8
+7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8
+7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8
+7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8
+7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5
+7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5
+7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10
+7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8
+7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8
+7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8
+7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8
+7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10
+7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8
+7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10
+7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5
+7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8
+7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8
+7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5
+7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5
+7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10
+7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5
+7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10
+7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8
+7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8
+3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8
+7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8
+3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8
+7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8
+7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10
+7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8
+3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5
+3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8
+3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8
+3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8
+3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8
+3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8
+3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8
+3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8
+3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8
+3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8
+3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8
+3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8
+3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8
+4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8
+4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8
+4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8
+4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8
+4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8
+4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8
+4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8
+4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8
+4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8
+4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8
+4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8
+4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8
+4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8
+4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8
+5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8
+5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8
+5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8
+5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8
+5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8
+5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8
+5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8
+5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8
+5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8
+5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8
+5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8
+5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8
+5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8
+5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8
+6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8
+6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8
+6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8
+6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8
+6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8
+6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8
+6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8
+6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8
+6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8
+6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8
+6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8
+6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8
+6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8
+6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8
+7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8
+7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8
+7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8
+7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10
+3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5
+3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5
+3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5
+3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5
+3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5
+3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5
+3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5
+3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5
+3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5
+3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5
+3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5
+3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5
+3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5
+3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5
+3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5
+3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5
+3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5
+3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5
+3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5
+3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5
+3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5
+3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5
+3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5
+3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5
+3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5
+3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5
+3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5
+3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5
+3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5
+3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5
+3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5
+3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5
+3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5
+3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5
+3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5
+3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5
+3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5
+3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5
+3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5
+4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5
+4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5
+4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5
+4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5
+4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5
+4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5
+4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5
+4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5
+4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5
+4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5
+4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5
+4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5
+4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5
+4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5
+4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5
+4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5
+4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5
+4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5
+4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5
+4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5
+4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5
+4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5
+4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5
+4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5
+4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5
+4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5
+4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5
+4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5
+4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5
+4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5
+4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5
+4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5
+4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5
+4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5
+4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5
+4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5
+4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5
+4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5
+4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5
+4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5
+4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5
+4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5
+4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5
+4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5
+4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5
+4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5
+4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5
+4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5
+4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5
+5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5
+5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5
+5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5
+5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5
+5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5
+5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5
+5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5
+5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5
+5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5
+5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5
+5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5
+5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5
+5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5
+5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5
+5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5
+5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5
+5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5
+5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5
+5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5
+5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5
+5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5
+5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5
+5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5
+5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5
+5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5
+5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5
+5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5
+5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5
+5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5
+5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5
+5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5
+5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5
+5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5
+5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5
+5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5
+5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5
+5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5
+5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5
+5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5
+5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5
+5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5
+5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5
+5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5
+5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5
+5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5
+5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5
+5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5
+5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5
+5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5
+6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5
+6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5
+6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5
+6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5
+6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5
+6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5
+6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5
+6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5
+6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5
+6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5
+6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5
+6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5
+6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5
+6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5
+6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5
+6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5
+6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5
+6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5
+6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5
+6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5
+6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5
+6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5
+6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5
+6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5
+6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5
+6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5
+6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5
+6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5
+6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5
+6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5
+6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5
+6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5
+6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5
+6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5
+6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5
+6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5
+6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5
+6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5
+6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5
+6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5
+6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5
+6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5
+6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5
+6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5
+6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5
+6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5
+6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5
+6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5
+6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5
+7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5
+7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5
+7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5
+7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5
+7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5
+7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5
+7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5
+7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5
+7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5
+7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5
+7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5
+3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5
+7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9
diff --git a/profile_results/real_llama3_by_mesh_dim.svg b/profile_results/real_llama3_by_mesh_dim.svg
new file mode 100644
index 00000000..6eb41508
--- /dev/null
+++ b/profile_results/real_llama3_by_mesh_dim.svg
@@ -0,0 +1,167 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1600" height="1000" viewBox="0 0 1600 1000">
+<rect width="100%" height="100%" fill="white"/>
+<text x="32" y="34" font-family="sans-serif" font-size="22" font-weight="700">Real Llama3 optimizer profile vs mesh dimension</text>
+<text x="32" y="58" font-family="sans-serif" font-size="12" fill="#475569">Y axes are log scale. Missing series points timed out or were not run.</text>
+<line x1="32" y1="84" x2="60" y2="84" stroke="#2563eb" stroke-width="3"/>
+<text x="68" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=1B</text>
+<line x1="212" y1="84" x2="240" y2="84" stroke="#dc2626" stroke-width="3"/>
+<text x="248" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=3B</text>
+<line x1="392" y1="84" x2="420" y2="84" stroke="#16a34a" stroke-width="3"/>
+<text x="428" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=405B</text>
+<line x1="572" y1="84" x2="600" y2="84" stroke="#9333ea" stroke-width="3"/>
+<text x="608" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=70B</text>
+<line x1="752" y1="84" x2="780" y2="84" stroke="#ea580c" stroke-width="3"/>
+<text x="788" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=8B</text>
+<text x="62" y="106" font-family="sans-serif" font-size="14" font-weight="700">strategy enum (s)</text>
+<rect x="62" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="62" y1="300" x2="422" y2="300" stroke="#64748b"/>
+<line x1="62" y1="120" x2="62" y2="300" stroke="#64748b"/>
+<text x="12" y="132" font-family="sans-serif" font-size="10" fill="#64748b">9.9</text>
+<text x="12" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
+<text x="46.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="406.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="62.0,286.0 422.0,130.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="62.0" cy="286.0" r="3.5" fill="#2563eb"/>
+<circle cx="422.0" cy="130.3" r="3.5" fill="#2563eb"/>
+<polyline points="62.0,300.0 422.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="62.0" cy="300.0" r="3.5" fill="#dc2626"/>
+<circle cx="422.0" cy="120.0" r="3.5" fill="#dc2626"/>
+<circle cx="62.0" cy="203.6" r="3.5" fill="#16a34a"/>
+<circle cx="62.0" cy="187.9" r="3.5" fill="#9333ea"/>
+<polyline points="62.0,265.4 422.0,124.3" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="62.0" cy="265.4" r="3.5" fill="#ea580c"/>
+<circle cx="422.0" cy="124.3" r="3.5" fill="#ea580c"/>
+<text x="452" y="106" font-family="sans-serif" font-size="14" font-weight="700">cost estimation (s)</text>
+<rect x="452" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="452" y1="300" x2="812" y2="300" stroke="#64748b"/>
+<line x1="452" y1="120" x2="452" y2="300" stroke="#64748b"/>
+<text x="402" y="132" font-family="sans-serif" font-size="10" fill="#64748b">5</text>
+<text x="402" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
+<text x="436.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="796.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="452.0,300.0 812.0,124.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="452.0" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="812.0" cy="124.3" r="3.5" fill="#2563eb"/>
+<polyline points="452.0,298.4 812.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="452.0" cy="298.4" r="3.5" fill="#dc2626"/>
+<circle cx="812.0" cy="120.0" r="3.5" fill="#dc2626"/>
+<circle cx="452.0" cy="299.1" r="3.5" fill="#16a34a"/>
+<circle cx="452.0" cy="281.4" r="3.5" fill="#9333ea"/>
+<polyline points="452.0,297.8 812.0,124.5" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="452.0" cy="297.8" r="3.5" fill="#ea580c"/>
+<circle cx="812.0" cy="124.5" r="3.5" fill="#ea580c"/>
+<text x="842" y="106" font-family="sans-serif" font-size="14" font-weight="700">ILP construction (s)</text>
+<rect x="842" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="842" y1="300" x2="1202" y2="300" stroke="#64748b"/>
+<line x1="842" y1="120" x2="842" y2="300" stroke="#64748b"/>
+<text x="792" y="132" font-family="sans-serif" font-size="10" fill="#64748b">14</text>
+<text x="792" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.26</text>
+<text x="826.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="1186.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="842.0,300.0 1202.0,134.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="842.0" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="1202.0" cy="134.3" r="3.5" fill="#2563eb"/>
+<polyline points="842.0,292.0 1202.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="842.0" cy="292.0" r="3.5" fill="#dc2626"/>
+<circle cx="1202.0" cy="120.0" r="3.5" fill="#dc2626"/>
+<circle cx="842.0" cy="208.5" r="3.5" fill="#16a34a"/>
+<circle cx="842.0" cy="203.9" r="3.5" fill="#9333ea"/>
+<polyline points="842.0,254.2 1202.0,129.6" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="842.0" cy="254.2" r="3.5" fill="#ea580c"/>
+<circle cx="1202.0" cy="129.6" r="3.5" fill="#ea580c"/>
+<text x="1232" y="106" font-family="sans-serif" font-size="14" font-weight="700">objective build (s)</text>
+<rect x="1232" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="1232" y1="300" x2="1592" y2="300" stroke="#64748b"/>
+<line x1="1232" y1="120" x2="1232" y2="300" stroke="#64748b"/>
+<text x="1182" y="132" font-family="sans-serif" font-size="10" fill="#64748b">3.3</text>
+<text x="1182" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.053</text>
+<text x="1216.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="1576.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="1232.0,300.0 1592.0,138.1" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="1232.0" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="1592.0" cy="138.1" r="3.5" fill="#2563eb"/>
+<polyline points="1232.0,295.0 1592.0,133.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="1232.0" cy="295.0" r="3.5" fill="#dc2626"/>
+<circle cx="1592.0" cy="133.0" r="3.5" fill="#dc2626"/>
+<circle cx="1232.0" cy="288.4" r="3.5" fill="#16a34a"/>
+<circle cx="1232.0" cy="286.3" r="3.5" fill="#9333ea"/>
+<polyline points="1232.0,295.7 1592.0,120.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="1232.0" cy="295.7" r="3.5" fill="#ea580c"/>
+<circle cx="1592.0" cy="120.0" r="3.5" fill="#ea580c"/>
+<text x="62" y="336" font-family="sans-serif" font-size="14" font-weight="700">solve (s)</text>
+<rect x="62" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="62" y1="530" x2="422" y2="530" stroke="#64748b"/>
+<line x1="62" y1="350" x2="62" y2="530" stroke="#64748b"/>
+<text x="12" y="362" font-family="sans-serif" font-size="10" fill="#64748b">86</text>
+<text x="12" y="530" font-family="sans-serif" font-size="10" fill="#64748b">0.49</text>
+<text x="46.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="406.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="62.0,530.0 422.0,352.4" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="62.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="422.0" cy="352.4" r="3.5" fill="#2563eb"/>
+<polyline points="62.0,523.9 422.0,353.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="62.0" cy="523.9" r="3.5" fill="#dc2626"/>
+<circle cx="422.0" cy="353.0" r="3.5" fill="#dc2626"/>
+<circle cx="62.0" cy="472.4" r="3.5" fill="#16a34a"/>
+<circle cx="62.0" cy="490.5" r="3.5" fill="#9333ea"/>
+<polyline points="62.0,523.9 422.0,350.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="62.0" cy="523.9" r="3.5" fill="#ea580c"/>
+<circle cx="422.0" cy="350.0" r="3.5" fill="#ea580c"/>
+<text x="452" y="336" font-family="sans-serif" font-size="14" font-weight="700">pipeline total (s)</text>
+<rect x="452" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="452" y1="530" x2="812" y2="530" stroke="#64748b"/>
+<line x1="452" y1="350" x2="452" y2="530" stroke="#64748b"/>
+<text x="402" y="362" font-family="sans-serif" font-size="10" fill="#64748b">124</text>
+<text x="402" y="530" font-family="sans-serif" font-size="10" fill="#64748b">3</text>
+<text x="436.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="796.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="452.0,530.0 812.0,356.0" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="452.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="812.0" cy="356.0" r="3.5" fill="#2563eb"/>
+<polyline points="452.0,514.5 812.0,352.1" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="452.0" cy="514.5" r="3.5" fill="#dc2626"/>
+<circle cx="812.0" cy="352.1" r="3.5" fill="#dc2626"/>
+<circle cx="452.0" cy="419.0" r="3.5" fill="#16a34a"/>
+<circle cx="452.0" cy="438.3" r="3.5" fill="#9333ea"/>
+<polyline points="452.0,508.1 812.0,350.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="452.0" cy="508.1" r="3.5" fill="#ea580c"/>
+<circle cx="812.0" cy="350.0" r="3.5" fill="#ea580c"/>
+<text x="842" y="336" font-family="sans-serif" font-size="14" font-weight="700">unique ILP vars</text>
+<rect x="842" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="842" y1="530" x2="1202" y2="530" stroke="#64748b"/>
+<line x1="842" y1="350" x2="842" y2="530" stroke="#64748b"/>
+<text x="792" y="362" font-family="sans-serif" font-size="10" fill="#64748b">488.5K</text>
+<text x="792" y="530" font-family="sans-serif" font-size="10" fill="#64748b">13.0K</text>
+<text x="826.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="1186.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="842.0,530.0 1202.0,350.6" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="842.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="1202.0" cy="350.6" r="3.5" fill="#2563eb"/>
+<polyline points="842.0,526.8 1202.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="842.0" cy="526.8" r="3.5" fill="#dc2626"/>
+<circle cx="1202.0" cy="350.0" r="3.5" fill="#dc2626"/>
+<circle cx="842.0" cy="515.7" r="3.5" fill="#16a34a"/>
+<circle cx="842.0" cy="520.2" r="3.5" fill="#9333ea"/>
+<polyline points="842.0,526.6 1202.0,350.1" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="842.0" cy="526.6" r="3.5" fill="#ea580c"/>
+<circle cx="1202.0" cy="350.1" r="3.5" fill="#ea580c"/>
+<text x="1232" y="336" font-family="sans-serif" font-size="14" font-weight="700">constraints</text>
+<rect x="1232" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="1232" y1="530" x2="1592" y2="530" stroke="#64748b"/>
+<line x1="1232" y1="350" x2="1232" y2="530" stroke="#64748b"/>
+<text x="1182" y="362" font-family="sans-serif" font-size="10" fill="#64748b">177.2K</text>
+<text x="1182" y="530" font-family="sans-serif" font-size="10" fill="#64748b">7.0K</text>
+<text x="1216.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
+<text x="1576.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
+<polyline points="1232.0,530.0 1592.0,351.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="1232.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="1592.0" cy="351.3" r="3.5" fill="#2563eb"/>
+<polyline points="1232.0,522.3 1592.0,350.2" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="1232.0" cy="522.3" r="3.5" fill="#dc2626"/>
+<circle cx="1592.0" cy="350.2" r="3.5" fill="#dc2626"/>
+<circle cx="1232.0" cy="486.0" r="3.5" fill="#16a34a"/>
+<circle cx="1232.0" cy="500.0" r="3.5" fill="#9333ea"/>
+<polyline points="1232.0,520.3 1592.0,350.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
+<circle cx="1232.0" cy="520.3" r="3.5" fill="#ea580c"/>
+<circle cx="1592.0" cy="350.0" r="3.5" fill="#ea580c"/>
+</svg>
\ No newline at end of file
diff --git a/profile_results/real_llama3_by_model_size.svg b/profile_results/real_llama3_by_model_size.svg
new file mode 100644
index 00000000..11fabae2
--- /dev/null
+++ b/profile_results/real_llama3_by_model_size.svg
@@ -0,0 +1,177 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="1600" height="1000" viewBox="0 0 1600 1000">
+<rect width="100%" height="100%" fill="white"/>
+<text x="32" y="34" font-family="sans-serif" font-size="22" font-weight="700">Real Llama3 optimizer profile vs model size</text>
+<text x="32" y="58" font-family="sans-serif" font-size="12" fill="#475569">Y axes are log scale. Missing series points timed out or were not run.</text>
+<line x1="32" y1="84" x2="60" y2="84" stroke="#2563eb" stroke-width="3"/>
+<text x="68" y="88" font-family="sans-serif" font-size="12" fill="#334155">mesh_ndim=1</text>
+<line x1="212" y1="84" x2="240" y2="84" stroke="#dc2626" stroke-width="3"/>
+<text x="248" y="88" font-family="sans-serif" font-size="12" fill="#334155">mesh_ndim=2</text>
+<text x="62" y="106" font-family="sans-serif" font-size="14" font-weight="700">strategy enum (s)</text>
+<rect x="62" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="62" y1="300" x2="422" y2="300" stroke="#64748b"/>
+<line x1="62" y1="120" x2="62" y2="300" stroke="#64748b"/>
+<text x="12" y="132" font-family="sans-serif" font-size="10" fill="#64748b">9.9</text>
+<text x="12" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
+<text x="46.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="47.75893682743604" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="52.04521657000967" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="107.67414055497832" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="406.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="62.0,286.0 63.8,300.0 68.0,265.4 123.7,187.9 422.0,203.6" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="62.0" cy="286.0" r="3.5" fill="#2563eb"/>
+<circle cx="63.8" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="68.0" cy="265.4" r="3.5" fill="#2563eb"/>
+<circle cx="123.7" cy="187.9" r="3.5" fill="#2563eb"/>
+<circle cx="422.0" cy="203.6" r="3.5" fill="#2563eb"/>
+<polyline points="62.0,130.3 63.8,120.0 68.0,124.3" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="62.0" cy="130.3" r="3.5" fill="#dc2626"/>
+<circle cx="63.8" cy="120.0" r="3.5" fill="#dc2626"/>
+<circle cx="68.0" cy="124.3" r="3.5" fill="#dc2626"/>
+<text x="452" y="106" font-family="sans-serif" font-size="14" font-weight="700">cost estimation (s)</text>
+<rect x="452" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="452" y1="300" x2="812" y2="300" stroke="#64748b"/>
+<line x1="452" y1="120" x2="452" y2="300" stroke="#64748b"/>
+<text x="402" y="132" font-family="sans-serif" font-size="10" fill="#64748b">5</text>
+<text x="402" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
+<text x="436.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="437.75893682743606" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="442.04521657000964" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="497.6741405549783" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="796.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="452.0,300.0 453.8,298.4 458.0,297.8 513.7,281.4 812.0,299.1" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="452.0" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="453.8" cy="298.4" r="3.5" fill="#2563eb"/>
+<circle cx="458.0" cy="297.8" r="3.5" fill="#2563eb"/>
+<circle cx="513.7" cy="281.4" r="3.5" fill="#2563eb"/>
+<circle cx="812.0" cy="299.1" r="3.5" fill="#2563eb"/>
+<polyline points="452.0,124.3 453.8,120.0 458.0,124.5" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="452.0" cy="124.3" r="3.5" fill="#dc2626"/>
+<circle cx="453.8" cy="120.0" r="3.5" fill="#dc2626"/>
+<circle cx="458.0" cy="124.5" r="3.5" fill="#dc2626"/>
+<text x="842" y="106" font-family="sans-serif" font-size="14" font-weight="700">ILP construction (s)</text>
+<rect x="842" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="842" y1="300" x2="1202" y2="300" stroke="#64748b"/>
+<line x1="842" y1="120" x2="842" y2="300" stroke="#64748b"/>
+<text x="792" y="132" font-family="sans-serif" font-size="10" fill="#64748b">14</text>
+<text x="792" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.26</text>
+<text x="826.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="827.7589368274361" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="832.0452165700096" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="887.6741405549783" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="1186.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="842.0,300.0 843.8,292.0 848.0,254.2 903.7,203.9 1202.0,208.5" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="842.0" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="843.8" cy="292.0" r="3.5" fill="#2563eb"/>
+<circle cx="848.0" cy="254.2" r="3.5" fill="#2563eb"/>
+<circle cx="903.7" cy="203.9" r="3.5" fill="#2563eb"/>
+<circle cx="1202.0" cy="208.5" r="3.5" fill="#2563eb"/>
+<polyline points="842.0,134.3 843.8,120.0 848.0,129.6" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="842.0" cy="134.3" r="3.5" fill="#dc2626"/>
+<circle cx="843.8" cy="120.0" r="3.5" fill="#dc2626"/>
+<circle cx="848.0" cy="129.6" r="3.5" fill="#dc2626"/>
+<text x="1232" y="106" font-family="sans-serif" font-size="14" font-weight="700">objective build (s)</text>
+<rect x="1232" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="1232" y1="300" x2="1592" y2="300" stroke="#64748b"/>
+<line x1="1232" y1="120" x2="1232" y2="300" stroke="#64748b"/>
+<text x="1182" y="132" font-family="sans-serif" font-size="10" fill="#64748b">3.3</text>
+<text x="1182" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.053</text>
+<text x="1216.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="1217.758936827436" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="1222.0452165700096" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="1277.6741405549783" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="1576.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="1232.0,300.0 1233.8,295.0 1238.0,295.7 1293.7,286.3 1592.0,288.4" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="1232.0" cy="300.0" r="3.5" fill="#2563eb"/>
+<circle cx="1233.8" cy="295.0" r="3.5" fill="#2563eb"/>
+<circle cx="1238.0" cy="295.7" r="3.5" fill="#2563eb"/>
+<circle cx="1293.7" cy="286.3" r="3.5" fill="#2563eb"/>
+<circle cx="1592.0" cy="288.4" r="3.5" fill="#2563eb"/>
+<polyline points="1232.0,138.1 1233.8,133.0 1238.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="1232.0" cy="138.1" r="3.5" fill="#dc2626"/>
+<circle cx="1233.8" cy="133.0" r="3.5" fill="#dc2626"/>
+<circle cx="1238.0" cy="120.0" r="3.5" fill="#dc2626"/>
+<text x="62" y="336" font-family="sans-serif" font-size="14" font-weight="700">solve (s)</text>
+<rect x="62" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="62" y1="530" x2="422" y2="530" stroke="#64748b"/>
+<line x1="62" y1="350" x2="62" y2="530" stroke="#64748b"/>
+<text x="12" y="362" font-family="sans-serif" font-size="10" fill="#64748b">86</text>
+<text x="12" y="530" font-family="sans-serif" font-size="10" fill="#64748b">0.49</text>
+<text x="46.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="47.75893682743604" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="52.04521657000967" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="107.67414055497832" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="406.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="62.0,530.0 63.8,523.9 68.0,523.9 123.7,490.5 422.0,472.4" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="62.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="63.8" cy="523.9" r="3.5" fill="#2563eb"/>
+<circle cx="68.0" cy="523.9" r="3.5" fill="#2563eb"/>
+<circle cx="123.7" cy="490.5" r="3.5" fill="#2563eb"/>
+<circle cx="422.0" cy="472.4" r="3.5" fill="#2563eb"/>
+<polyline points="62.0,352.4 63.8,353.0 68.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="62.0" cy="352.4" r="3.5" fill="#dc2626"/>
+<circle cx="63.8" cy="353.0" r="3.5" fill="#dc2626"/>
+<circle cx="68.0" cy="350.0" r="3.5" fill="#dc2626"/>
+<text x="452" y="336" font-family="sans-serif" font-size="14" font-weight="700">pipeline total (s)</text>
+<rect x="452" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="452" y1="530" x2="812" y2="530" stroke="#64748b"/>
+<line x1="452" y1="350" x2="452" y2="530" stroke="#64748b"/>
+<text x="402" y="362" font-family="sans-serif" font-size="10" fill="#64748b">124</text>
+<text x="402" y="530" font-family="sans-serif" font-size="10" fill="#64748b">3</text>
+<text x="436.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="437.75893682743606" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="442.04521657000964" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="497.6741405549783" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="796.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="452.0,530.0 453.8,514.5 458.0,508.1 513.7,438.3 812.0,419.0" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="452.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="453.8" cy="514.5" r="3.5" fill="#2563eb"/>
+<circle cx="458.0" cy="508.1" r="3.5" fill="#2563eb"/>
+<circle cx="513.7" cy="438.3" r="3.5" fill="#2563eb"/>
+<circle cx="812.0" cy="419.0" r="3.5" fill="#2563eb"/>
+<polyline points="452.0,356.0 453.8,352.1 458.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="452.0" cy="356.0" r="3.5" fill="#dc2626"/>
+<circle cx="453.8" cy="352.1" r="3.5" fill="#dc2626"/>
+<circle cx="458.0" cy="350.0" r="3.5" fill="#dc2626"/>
+<text x="842" y="336" font-family="sans-serif" font-size="14" font-weight="700">unique ILP vars</text>
+<rect x="842" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="842" y1="530" x2="1202" y2="530" stroke="#64748b"/>
+<line x1="842" y1="350" x2="842" y2="530" stroke="#64748b"/>
+<text x="792" y="362" font-family="sans-serif" font-size="10" fill="#64748b">488.5K</text>
+<text x="792" y="530" font-family="sans-serif" font-size="10" fill="#64748b">13.0K</text>
+<text x="826.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="827.7589368274361" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="832.0452165700096" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="887.6741405549783" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="1186.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="842.0,530.0 843.8,526.8 848.0,526.6 903.7,520.2 1202.0,515.7" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="842.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="843.8" cy="526.8" r="3.5" fill="#2563eb"/>
+<circle cx="848.0" cy="526.6" r="3.5" fill="#2563eb"/>
+<circle cx="903.7" cy="520.2" r="3.5" fill="#2563eb"/>
+<circle cx="1202.0" cy="515.7" r="3.5" fill="#2563eb"/>
+<polyline points="842.0,350.6 843.8,350.0 848.0,350.1" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="842.0" cy="350.6" r="3.5" fill="#dc2626"/>
+<circle cx="843.8" cy="350.0" r="3.5" fill="#dc2626"/>
+<circle cx="848.0" cy="350.1" r="3.5" fill="#dc2626"/>
+<text x="1232" y="336" font-family="sans-serif" font-size="14" font-weight="700">constraints</text>
+<rect x="1232" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
+<line x1="1232" y1="530" x2="1592" y2="530" stroke="#64748b"/>
+<line x1="1232" y1="350" x2="1232" y2="530" stroke="#64748b"/>
+<text x="1182" y="362" font-family="sans-serif" font-size="10" fill="#64748b">177.2K</text>
+<text x="1182" y="530" font-family="sans-serif" font-size="10" fill="#64748b">7.0K</text>
+<text x="1216.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
+<text x="1217.758936827436" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
+<text x="1222.0452165700096" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
+<text x="1277.6741405549783" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
+<text x="1576.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
+<polyline points="1232.0,530.0 1233.8,522.3 1238.0,520.3 1293.7,500.0 1592.0,486.0" fill="none" stroke="#2563eb" stroke-width="2.4"/>
+<circle cx="1232.0" cy="530.0" r="3.5" fill="#2563eb"/>
+<circle cx="1233.8" cy="522.3" r="3.5" fill="#2563eb"/>
+<circle cx="1238.0" cy="520.3" r="3.5" fill="#2563eb"/>
+<circle cx="1293.7" cy="500.0" r="3.5" fill="#2563eb"/>
+<circle cx="1592.0" cy="486.0" r="3.5" fill="#2563eb"/>
+<polyline points="1232.0,351.3 1233.8,350.2 1238.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
+<circle cx="1232.0" cy="351.3" r="3.5" fill="#dc2626"/>
+<circle cx="1233.8" cy="350.2" r="3.5" fill="#dc2626"/>
+<circle cx="1238.0" cy="350.0" r="3.5" fill="#dc2626"/>
+</svg>
\ No newline at end of file
diff --git a/profile_results/real_llama3_dag_analysis.py b/profile_results/real_llama3_dag_analysis.py
new file mode 100644
index 00000000..03b445a3
--- /dev/null
+++ b/profile_results/real_llama3_dag_analysis.py
@@ -0,0 +1,255 @@
+import csv
+import json
+import logging
+import re
+import sys
+import time
+from collections import Counter, defaultdict
+from pathlib import Path
+
+import networkx as nx
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+sys.path.insert(0, "/home/wangkj/workspace/torchtitan")
+
+from torchtitan.models.llama3 import llama3_configs  # noqa: E402
+
+from autoparallel.api import AutoParallel
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+
+WORLD_SIZE = 64
+SEQ_LEN = 256
+GLOBAL_BATCH = 64
+
+
+def init_dist():
+    if not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(
+            "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE
+        )
+
+
+def target_name(node):
+    target = node.target
+    if hasattr(target, "__name__"):
+        return target.__name__
+    return str(target)
+
+
+def layer_id(node):
+    stacks = []
+    for key in ("nn_module_stack", "fwd_nn_module_stack"):
+        value = node.meta.get(key)
+        if value:
+            stacks.append(str(value))
+    text = " ".join(stacks)
+    match = re.search(r"layers[._']+([0-9]+)", text)
+    return int(match.group(1)) if match else ""
+
+
+def phase(node):
+    if "fwd_nn_module_stack" in node.meta:
+        return "backward"
+    if "nn_module_stack" in node.meta:
+        return "forward"
+    if node.op == "placeholder" and str(node.name).startswith("tangents"):
+        return "backward"
+    return "unknown"
+
+
+def bitset_counts(nodes, edges):
+    idx = {node: i for i, node in enumerate(nodes)}
+    children = [[] for _ in nodes]
+    parents = [[] for _ in nodes]
+    for src, dst in edges:
+        children[idx[src]].append(idx[dst])
+        parents[idx[dst]].append(idx[src])
+
+    descendants = [0] * len(nodes)
+    for i in range(len(nodes) - 1, -1, -1):
+        bits = 0
+        for child in children[i]:
+            bits |= 1 << child
+            bits |= descendants[child]
+        descendants[i] = bits
+
+    ancestors = [0] * len(nodes)
+    for i in range(len(nodes)):
+        bits = 0
+        for parent in parents[i]:
+            bits |= 1 << parent
+            bits |= ancestors[parent]
+        ancestors[i] = bits
+
+    return (
+        [bits.bit_count() for bits in ancestors],
+        [bits.bit_count() for bits in descendants],
+    )
+
+
+def treewidth_upper_bounds(edges):
+    graph = nx.Graph()
+    graph.add_edges_from(edges)
+    width_min_fill, _ = nx.approximation.treewidth_min_fill_in(graph)
+    width_min_degree, _ = nx.approximation.treewidth_min_degree(graph)
+
+    moral = graph.copy()
+    parents_by_child = defaultdict(list)
+    for src, dst in edges:
+        parents_by_child[dst].append(src)
+    for parents in parents_by_child.values():
+        for i, left in enumerate(parents):
+            for right in parents[i + 1 :]:
+                moral.add_edge(left, right)
+    moral_width_min_fill, _ = nx.approximation.treewidth_min_fill_in(moral)
+    moral_width_min_degree, _ = nx.approximation.treewidth_min_degree(moral)
+    return {
+        "undirected_min_fill": width_min_fill,
+        "undirected_min_degree": width_min_degree,
+        "moralized_min_fill": moral_width_min_fill,
+        "moralized_min_degree": moral_width_min_degree,
+        "undirected_edges": graph.number_of_edges(),
+        "moralized_edges": moral.number_of_edges(),
+    }
+
+
+def run_analysis(out_dir):
+    init_dist()
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda", (64,), mesh_dim_names=("dp",)
+    )
+    set_nccl_topo_config(detect_nccl_topo_config(mesh))
+
+    config = llama3_configs["3B"](attn_backend="sdpa")
+    config.rope.max_seq_len = SEQ_LEN
+    with torch.device("meta"):
+        model = config.build()
+
+    def input_fn():
+        return torch.randint(0, config.vocab_size, (GLOBAL_BATCH, SEQ_LEN), device="cuda")
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
+    )
+    t0 = time.perf_counter()
+    with AutoParallel(
+        model, input_fn, mesh, mp_policy, repeated_subgraphs=True
+    ) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        autop.add_input_constraints([(Shard(0),)])
+        autop.add_output_constraints([(Shard(0),)])
+        opt = autop.sharding_optimizer
+
+        ilp_nodes = [node for node in opt.nodes if node.op != "output"]
+        ilp_node_set = set(ilp_nodes)
+        edges = []
+        dep_args = {}
+        dep_unique = {}
+        for node in ilp_nodes:
+            inputs = [inp for inp in opt._all_input_nodes(node) if inp in ilp_node_set]
+            dep_args[node] = len(inputs)
+            dep_unique[node] = len(set(inputs))
+            for inp in set(inputs):
+                edges.append((inp, node))
+
+        offspring = Counter()
+        for src, _dst in edges:
+            offspring[src] += 1
+
+        ancestor_counts, descendant_counts = bitset_counts(ilp_nodes, edges)
+        node_to_idx = {node: i for i, node in enumerate(ilp_nodes)}
+        treewidth = treewidth_upper_bounds(edges)
+
+        rows = []
+        for node in ilp_nodes:
+            idx = node_to_idx[node]
+            rows.append(
+                {
+                    "idx": idx,
+                    "name": node.name,
+                    "op": node.op,
+                    "target": target_name(node),
+                    "phase": phase(node),
+                    "layer": layer_id(node),
+                    "direct_dependency_args": dep_args[node],
+                    "direct_dependency_nodes": dep_unique[node],
+                    "direct_offspring_nodes": offspring[node],
+                    "ancestor_count": ancestor_counts[idx],
+                    "descendant_count": descendant_counts[idx],
+                    "strategy_count": len(opt.strats[node].strategies),
+                }
+            )
+
+        merge_points = [
+            row for row in rows if int(row["direct_dependency_nodes"]) > 1
+        ]
+        merge_points.sort(
+            key=lambda row: (
+                -int(row["direct_dependency_nodes"]),
+                -int(row["descendant_count"]),
+                int(row["idx"]),
+            )
+        )
+        fanout_points = sorted(
+            rows,
+            key=lambda row: (-int(row["direct_offspring_nodes"]), int(row["idx"])),
+        )
+
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    node_csv = out_dir / "real_llama3_3b_dag_node_stats.csv"
+    with node_csv.open("w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
+        writer.writeheader()
+        writer.writerows(rows)
+
+    merge_csv = out_dir / "real_llama3_3b_merge_points.csv"
+    with merge_csv.open("w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
+        writer.writeheader()
+        writer.writerows(merge_points)
+
+    summary = {
+        "model": "LLaMA3 3B",
+        "mesh": "1D 64",
+        "trace_and_optimizer_build_s": time.perf_counter() - t0,
+        "ilp_nodes": len(ilp_nodes),
+        "dag_edges": len(edges),
+        "merge_points": len(merge_points),
+        "branch_points": sum(1 for row in rows if int(row["direct_offspring_nodes"]) > 1),
+        "max_direct_dependency_nodes": max(int(row["direct_dependency_nodes"]) for row in rows),
+        "max_direct_offspring_nodes": max(int(row["direct_offspring_nodes"]) for row in rows),
+        "max_ancestor_count": max(int(row["ancestor_count"]) for row in rows),
+        "max_descendant_count": max(int(row["descendant_count"]) for row in rows),
+        "treewidth_upper_bounds": treewidth,
+        "direct_dependency_histogram": dict(
+            sorted(Counter(int(row["direct_dependency_nodes"]) for row in rows).items())
+        ),
+        "direct_offspring_histogram": dict(
+            sorted(Counter(int(row["direct_offspring_nodes"]) for row in rows).items())
+        ),
+        "top_merge_points": merge_points[:30],
+        "top_fanout_points": fanout_points[:30],
+        "node_stats_csv": str(node_csv),
+        "merge_points_csv": str(merge_csv),
+    }
+    summary_path = out_dir / "real_llama3_3b_dag_summary.json"
+    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True))
+    print(json.dumps(summary, indent=2, sort_keys=True))
+
+
+def main():
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
+    )
+    run_analysis("profile_results")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/profile_results/real_llama3_optimizer_presolve_3d4d.log b/profile_results/real_llama3_optimizer_presolve_3d4d.log
new file mode 100644
index 00000000..923ec1f1
--- /dev/null
+++ b/profile_results/real_llama3_optimizer_presolve_3d4d.log
@@ -0,0 +1,7 @@
+[14:50:20] start model=1B mesh_ndim=3 skip_solve timeout=1200s
+2026-05-26 14:50:29,648 INFO:autoparallel.api:Graph tracing took 6.073s
+2026-05-26 14:58:18,227 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=strategy_enumeration mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B graph_nodes=4140 strategy_options=662279 option_tuples=181062856 elapsed=459.509s
+2026-05-26 15:07:42,067 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=decision_vars mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B unique_ilp_vars=20390366 logical_decision_vars=181062856 cluster_copied_decision_vars=160672490 elapsed=462.310s
+[15:10:23] done model=1B mesh_ndim=3 rc=124
+[15:10:23] start model=1B mesh_ndim=4 skip_solve timeout=1200s
+2026-05-26 15:10:32,788 INFO:autoparallel.api:Graph tracing took 6.079s
diff --git a/profile_results/real_llama3_optimizer_sweep.csv b/profile_results/real_llama3_optimizer_sweep.csv
new file mode 100644
index 00000000..30d2e4f5
--- /dev/null
+++ b/profile_results/real_llama3_optimizer_sweep.csv
@@ -0,0 +1,9 @@
+cluster_copied_decision_vars,compute_cost_estimation_s,constraints_init,constraints_solve,cost_estimation_s,decision_var_build_s,decision_var_overhead_s,edge_cost_estimation_s,extract_s,graph_nodes,ilp_construction_s,logical_decision_vars,max_strategies_per_node,mesh_ndim,mesh_shape,mesh_size,model_key,objective,objective_s,optimizer_pipeline_s,option_tuples,parameter_b,parameter_gib,parameter_nodes,parameter_numel,solve_s,status,strategy_enumeration_s,strategy_options,tensor_nodes,total_wall_s,unique_ilp_vars,validation_s
+101888,0.4790569522883743,7038,7042,0.5384756466373801,0.6978608381468803,0.09741721651516855,0.05941869434900582,0.01627982617355883,4140,0.26339267240837216,114928,10,1,64,64,1B,75411.02054353141,0.053351440001279116,3.032338660908863,114928,1.2358144,4.603767395019531,146,1235814400,0.49132931185886264,Optimal,0.6722944700159132,18503,4139,8.946083615999669,13040,0.31402136106044054
+194792,0.48607375379651785,8080,8084,0.5489266884978861,0.7471572819631547,0.1306607834994793,0.06285293470136821,0.029804171063005924,7200,0.3148333504796028,208698,10,1,64,64,3B,155857.5709074804,0.05978171294555068,4.169702837942168,208698,3.212749824,11.968425750732422,254,3212749824,0.5855939809698611,Optimal,0.5360530489124358,32969,7199,14.472710577072576,13906,0.03955800808034837
+224240,0.49045672081410885,8372,8376,0.5536671618465334,1.1619730349630117,0.5399796243291348,0.06321044103242457,0.03362119919620454,8220,0.7288401401601732,238203,10,1,64,64,8B,213343.3574716149,0.05892709596082568,4.762722868937999,238203,8.030261248,29.915054321289062,291,8030261248,0.5859912640880793,Optimal,0.9387421838473529,37635,8219,16.452271425863728,13963,0.045778295025229454
+596400,0.5983547926880419,12044,12048,0.6777467841748148,2.653722374001518,1.875488000921905,0.0793919914867729,0.2056352950166911,20460,2.2220516917295754,612283,10,1,64,64,70B,965500.0409067452,0.0730493909213692,20.028923405101523,612283,70.553706496,262.8330383300781,723,70553706496,1.5257919810246676,Optimal,3.3026473850477487,95379,20459,50.90600106609054,15883,0.1628595821093768
+946046,0.4775047143921256,15494,15498,0.5445251299533993,2.283439102116972,1.6354041469749063,0.0670204155612737,0.17483325605280697,32190,2.005914915120229,963447,10,1,64,64,405B,3172012.7008089907,0.06962158717215061,29.85055986023508,963447,405.8533888,1511.9216918945312,1137,405853388800,2.56223003892228,Optimal,2.5583339028526098,150073,32189,77.86599416891113,17401,0.18959671608172357
+3854214,1.9979437342844903,173186,173190,4.75627763918601,11.933482899097726,4.112962566781789,2.7583339049015194,0.03040059795603156,4140,10.42051934893243,4337060,82,2,8x8,64,1B,57041.81060181375,2.17517895414494,109.2090197771322,4337060,1.2358144,4.603767395019531,146,1235814400,80.18635749211535,Optimal,8.398531069047749,107753,4139,115.10326781589538,482846,0.024392321007326245
+7135218,2.101260715862736,176564,176568,5.0140090675558895,14.759843383915722,6.347998866345733,2.9127483516931534,0.04800663981586695,7200,14.323183785192668,7623714,82,2,8x8,64,3B,122291.9385011857,2.4431078990455717,118.39831594773568,7623714,3.212749824,11.968425750732422,254,3212749824,78.84844117495231,Optimal,9.923545255092904,188315,7199,130.30269417585805,488496,0.053027451038360596
+8216282,1.9884659524541348,177172,177176,4.743945160182193,13.453818985959515,5.6245344209019095,2.755479207728058,0.04394924081861973,8220,11.563520586816594,8703393,82,2,8x8,64,8B,178228.3264244111,3.2896198199596256,123.55457829684019,8703393,8.030261248,29.915054321289062,291,8030261248,86.02262015617453,Optimal,9.262494687922299,214965,8219,135.2341975120362,487111,0.0497884638607502
diff --git a/profile_results/real_llama3_optimizer_sweep.jsonl b/profile_results/real_llama3_optimizer_sweep.jsonl
new file mode 100644
index 00000000..67428955
--- /dev/null
+++ b/profile_results/real_llama3_optimizer_sweep.jsonl
@@ -0,0 +1,8 @@
+{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054}
+{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837}
+{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454}
+{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768}
+{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357}
+{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245}
+{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596}
+{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, "extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502}
diff --git a/profile_results/real_llama3_optimizer_sweep.log b/profile_results/real_llama3_optimizer_sweep.log
new file mode 100644
index 00000000..21b02b4e
--- /dev/null
+++ b/profile_results/real_llama3_optimizer_sweep.log
@@ -0,0 +1,54 @@
+[14:16:02] start model=1B mesh_ndim=1 timeout=900s
+2026-05-26 14:16:10,889 INFO:autoparallel.api:Graph tracing took 5.582s
+2026-05-26 14:16:13,492 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 strategy_options=18503 option_tuples=114928 unique_ilp_vars=13040 logical_decision_vars=114928 constraints=7038 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,validation=0.314s,total=2.469s}
+2026-05-26 14:16:14,059 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B unique_ilp_vars=13040 constraints=7042 status=Optimal objective=75411.0205 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,objective=0.053s,solve=0.491s,extract=0.016s,total_solve_call=0.563s,total_pipeline=3.032s}
+{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054}
+[14:16:15] done model=1B mesh_ndim=1 rc=0
+[14:16:15] start model=3B mesh_ndim=1 timeout=900s
+2026-05-26 14:16:27,671 INFO:autoparallel.api:Graph tracing took 9.505s
+2026-05-26 14:16:31,732 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=32969 option_tuples=208698 unique_ilp_vars=13906 logical_decision_vars=208698 constraints=8080 timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,validation=0.040s,total=3.492s}
+2026-05-26 14:16:32,416 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B unique_ilp_vars=13906 constraints=8084 status=Optimal objective=155857.5709 timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,objective=0.060s,solve=0.586s,extract=0.030s,total_solve_call=0.678s,total_pipeline=4.170s}
+{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837}
+[14:16:33] done model=3B mesh_ndim=1 rc=0
+[14:16:33] start model=8B mesh_ndim=1 timeout=900s
+2026-05-26 14:16:47,847 INFO:autoparallel.api:Graph tracing took 11.170s
+2026-05-26 14:16:52,205 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=37635 option_tuples=238203 unique_ilp_vars=13963 logical_decision_vars=238203 constraints=8372 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,validation=0.046s,total=4.081s}
+2026-05-26 14:16:52,893 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=8.03B unique_ilp_vars=13963 constraints=8376 status=Optimal objective=213343.3575 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,objective=0.059s,solve=0.586s,extract=0.034s,total_solve_call=0.681s,total_pipeline=4.763s}
+{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454}
+[14:16:54] done model=8B mesh_ndim=1 rc=0
+[14:16:54] start model=70B mesh_ndim=1 timeout=900s
+2026-05-26 14:17:27,109 INFO:autoparallel.api:Graph tracing took 29.053s
+2026-05-26 14:17:46,179 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B param_nodes=723 graph_nodes=20460 tensor_nodes=20459 strategy_options=95379 option_tuples=612283 unique_ilp_vars=15883 logical_decision_vars=612283 constraints=12044 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,validation=0.163s,total=18.219s}
+2026-05-26 14:17:48,011 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B unique_ilp_vars=15883 constraints=12048 status=Optimal objective=965500.0409 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,objective=0.073s,solve=1.526s,extract=0.206s,total_solve_call=1.810s,total_pipeline=20.029s}
+{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768}
+[14:17:51] done model=70B mesh_ndim=1 rc=0
+[14:17:51] start model=405B mesh_ndim=1 timeout=900s
+2026-05-26 14:18:40,587 INFO:autoparallel.api:Graph tracing took 45.218s
+2026-05-26 14:19:09,868 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B param_nodes=1137 graph_nodes=32190 tensor_nodes=32189 strategy_options=150073 option_tuples=963447 unique_ilp_vars=17401 logical_decision_vars=963447 constraints=15494 timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,validation=0.190s,total=27.039s}
+2026-05-26 14:19:12,705 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B unique_ilp_vars=17401 constraints=15498 status=Optimal objective=3172012.7008 timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,objective=0.070s,solve=2.562s,extract=0.175s,total_solve_call=2.811s,total_pipeline=29.851s}
+{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357}
+[14:19:15] done model=405B mesh_ndim=1 rc=0
+[14:19:15] start model=1B mesh_ndim=2 timeout=900s
+2026-05-26 14:19:24,184 INFO:autoparallel.api:Graph tracing took 5.551s
+2026-05-26 14:19:51,030 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 strategy_options=107753 option_tuples=4337060 unique_ilp_vars=482846 logical_decision_vars=4337060 constraints=173186 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,validation=0.024s,total=26.710s}
+2026-05-26 14:21:13,538 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B unique_ilp_vars=482846 constraints=173190 status=Optimal objective=57041.8106 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,objective=2.175s,solve=80.186s,extract=0.030s,total_solve_call=82.499s,total_pipeline=109.209s}
+{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245}
+[14:21:16] done model=1B mesh_ndim=2 rc=0
+[14:21:16] start model=3B mesh_ndim=2 timeout=900s
+2026-05-26 14:21:30,429 INFO:autoparallel.api:Graph tracing took 10.867s
+2026-05-26 14:22:08,135 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=188315 option_tuples=7623714 unique_ilp_vars=488496 logical_decision_vars=7623714 constraints=176564 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,validation=0.053s,total=36.956s}
+2026-05-26 14:23:29,596 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B unique_ilp_vars=488496 constraints=176568 status=Optimal objective=122291.9385 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,objective=2.443s,solve=78.848s,extract=0.048s,total_solve_call=81.443s,total_pipeline=118.398s}
+{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596}
+[14:23:32] done model=3B mesh_ndim=2 rc=0
+[14:23:32] start model=8B mesh_ndim=2 timeout=900s
+2026-05-26 14:23:46,265 INFO:autoparallel.api:Graph tracing took 11.149s
+2026-05-26 14:24:20,655 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=214965 option_tuples=8703393 unique_ilp_vars=487111 logical_decision_vars=8703393 constraints=177172 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,validation=0.050s,total=34.114s}
+2026-05-26 14:25:50,114 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B unique_ilp_vars=487111 constraints=177176 status=Optimal objective=178228.3264 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,objective=3.290s,solve=86.023s,extract=0.044s,total_solve_call=89.441s,total_pipeline=123.555s}
+{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, "extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502}
+[14:25:52] done model=8B mesh_ndim=2 rc=0
+[14:25:52] start model=1B mesh_ndim=3 timeout=300s
+2026-05-26 14:26:01,331 INFO:autoparallel.api:Graph tracing took 5.531s
+[14:30:53] done model=1B mesh_ndim=3 rc=124
+[14:30:53] start model=1B mesh_ndim=4 timeout=300s
+2026-05-26 14:31:01,610 INFO:autoparallel.api:Graph tracing took 5.635s
+[14:35:53] done model=1B mesh_ndim=4 rc=124
diff --git a/profile_results/real_llama3_optimizer_sweep.py b/profile_results/real_llama3_optimizer_sweep.py
new file mode 100644
index 00000000..7e32b14c
--- /dev/null
+++ b/profile_results/real_llama3_optimizer_sweep.py
@@ -0,0 +1,351 @@
+import argparse
+import csv
+import json
+import logging
+import math
+import sys
+import time
+from pathlib import Path
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+sys.path.insert(0, "/home/wangkj/workspace/torchtitan")
+
+from torchtitan.models.llama3 import llama3_configs  # noqa: E402
+
+from autoparallel.api import AutoParallel
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+
+WORLD_SIZE = 64
+SEQ_LEN = 256
+GLOBAL_BATCH = 64
+MESHES = {
+    1: ((64,), ("dp",)),
+    2: ((8, 8), ("dp", "tp")),
+    3: ((4, 4, 4), ("dp", "tp", "cp")),
+    4: ((4, 4, 2, 2), ("dp", "tp", "cp", "ep")),
+}
+
+
+def init_dist():
+    if not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(
+            "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE
+        )
+
+
+def flatten_profile(model_key, mesh_ndim, profile, total_wall_s, solve_ran):
+    model = profile["model"]
+    timings = profile["timings"]
+    strategies = profile["strategies"]
+    ilp = profile["ilp"]
+    solve = profile.get("last_solve", {})
+    return {
+        "model_key": model_key,
+        "mesh_ndim": mesh_ndim,
+        "mesh_shape": "x".join(map(str, profile["mesh"]["shape"])),
+        "mesh_size": profile["mesh"]["size"],
+        "parameter_numel": model["parameter_numel"],
+        "parameter_b": model["parameter_numel"] / 1_000_000_000,
+        "parameter_gib": model["parameter_bytes"] / (1024**3),
+        "graph_nodes": model["graph_nodes"],
+        "tensor_nodes": model["tensor_nodes"],
+        "parameter_nodes": model["parameter_nodes"],
+        "strategy_options": strategies["strategy_options"],
+        "option_tuples": strategies["option_tuples"],
+        "max_strategies_per_node": strategies["max_strategies_per_node"],
+        "unique_ilp_vars": ilp["unique_variables"],
+        "logical_decision_vars": ilp["logical_decision_variables"],
+        "cluster_copied_decision_vars": ilp["cluster_copied_decision_variables"],
+        "constraints_init": ilp["constraints"],
+        "constraints_presolve": profile.get("constraints_presolve", ilp["constraints"]),
+        "constraints_solve": solve.get("constraints", ""),
+        "strategy_enumeration_s": timings["strategy_enumeration_s"],
+        "compute_cost_estimation_s": timings["compute_cost_estimation_s"],
+        "edge_cost_estimation_s": timings["edge_cost_estimation_s"],
+        "cost_estimation_s": timings["cost_estimation_s"],
+        "decision_var_build_s": timings["decision_var_build_s"],
+        "decision_var_overhead_s": timings["decision_var_overhead_s"],
+        "ilp_construction_s": timings["ilp_construction_s"],
+        "validation_s": timings["validation_s"],
+        "objective_s": solve.get("objective_s", ""),
+        "solve_s": solve.get("solve_s", ""),
+        "extract_s": solve.get("extract_s", ""),
+        "optimizer_pipeline_s": solve.get(
+            "pipeline_total_s",
+            timings["init_total_s"],
+        ),
+        "total_wall_s": total_wall_s,
+        "objective": solve.get("objective", ""),
+        "status": solve.get("status", "NotSolved"),
+        "solve_ran": solve_ran,
+    }
+
+
+def run_one(model_key, mesh_ndim, skip_solve=False):
+    init_dist()
+    mesh_shape, mesh_dim_names = MESHES[mesh_ndim]
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda", mesh_shape, mesh_dim_names=mesh_dim_names
+    )
+    set_nccl_topo_config(detect_nccl_topo_config(mesh))
+
+    config = llama3_configs[model_key](attn_backend="sdpa")
+    config.rope.max_seq_len = SEQ_LEN
+    with torch.device("meta"):
+        model = config.build()
+
+    def input_fn():
+        return torch.randint(
+            0,
+            config.vocab_size,
+            (GLOBAL_BATCH, SEQ_LEN),
+            device="cuda",
+        )
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
+    )
+    t0 = time.perf_counter()
+    with AutoParallel(
+        model,
+        input_fn,
+        mesh,
+        mp_policy,
+        repeated_subgraphs=True,
+    ) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        input_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
+        if mesh.ndim == 1:
+            output_sharding = (Shard(0),)
+        else:
+            output_sharding = (Shard(0), Shard(2)) + (Replicate(),) * (
+                mesh.ndim - 2
+            )
+        autop.add_input_constraints([input_sharding])
+        autop.add_output_constraints([output_sharding])
+        autop.sharding_optimizer.profile["constraints_presolve"] = len(
+            autop.sharding_optimizer.prob.constraints
+        )
+        if not skip_solve:
+            autop.optimize_placement(verbose=False)
+        profile = autop.sharding_optimizer.profile
+    return flatten_profile(
+        model_key,
+        mesh_ndim,
+        profile,
+        time.perf_counter() - t0,
+        solve_ran=not skip_solve,
+    )
+
+
+def append_jsonl(path, row):
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("a") as f:
+        f.write(json.dumps(row, sort_keys=True) + "\n")
+
+
+def load_rows(path):
+    rows = []
+    with Path(path).open() as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                row = json.loads(line)
+                row.setdefault(
+                    "constraints_presolve",
+                    row.get("constraints_solve") or row.get("constraints_init"),
+                )
+                row.setdefault("solve_ran", row.get("solve_s", "") != "")
+                rows.append(row)
+    rows.sort(key=lambda r: (r["mesh_ndim"], r["parameter_numel"]))
+    return rows
+
+
+def write_csv(rows, path):
+    fields = []
+    for row in rows:
+        for key in row:
+            if key not in fields:
+                fields.append(key)
+    with Path(path).open("w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fields)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def nice(v):
+    if v >= 1_000_000_000:
+        return f"{v / 1_000_000_000:.1f}B"
+    if v >= 1_000_000:
+        return f"{v / 1_000_000:.1f}M"
+    if v >= 1_000:
+        return f"{v / 1_000:.1f}K"
+    if v >= 10:
+        return f"{v:.0f}"
+    return f"{v:.2g}"
+
+
+def write_svg(rows, path, x_key, series_key, title):
+    metrics = [
+        ("strategy_enumeration_s", "strategy enum (s)"),
+        ("cost_estimation_s", "cost estimation (s)"),
+        ("ilp_construction_s", "ILP construction (s)"),
+        ("objective_s", "objective build (s)"),
+        ("solve_s", "solve (s)"),
+        ("optimizer_pipeline_s", "pipeline total (s)"),
+        ("unique_ilp_vars", "unique ILP vars"),
+        ("constraints_presolve", "constraints"),
+    ]
+    width = 1600
+    height = 1000
+    panel_w = 360
+    panel_h = 180
+    margin_l = 62
+    margin_t = 120
+    gap_x = 30
+    gap_y = 50
+    colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ea580c"]
+
+    def sx(x, xs, px):
+        lo, hi = min(xs), max(xs)
+        if lo == hi:
+            return px + panel_w / 2
+        return px + (x - lo) / (hi - lo) * panel_w
+
+    def sy(y, ys, py):
+        positives = [v for v in ys if v > 0]
+        lo = min(positives)
+        hi = max(positives)
+        if lo == hi:
+            return py + panel_h / 2
+        return py + panel_h - (math.log10(max(y, lo)) - math.log10(lo)) / (
+            math.log10(hi) - math.log10(lo)
+        ) * panel_h
+
+    series_values = sorted({r[series_key] for r in rows})
+    x_values = sorted({float(r[x_key]) for r in rows})
+    svg = [
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
+        '<rect width="100%" height="100%" fill="white"/>',
+        f'<text x="32" y="34" font-family="sans-serif" font-size="22" font-weight="700">{title}</text>',
+        '<text x="32" y="58" font-family="sans-serif" font-size="12" fill="#475569">Y axes are log scale. Missing series points timed out or were not run.</text>',
+    ]
+    for i, value in enumerate(series_values):
+        x = 32 + (i % 8) * 180
+        y = 84 + (i // 8) * 20
+        svg.append(
+            f'<line x1="{x}" y1="{y}" x2="{x + 28}" y2="{y}" stroke="{colors[i % len(colors)]}" stroke-width="3"/>'
+        )
+        svg.append(
+            f'<text x="{x + 36}" y="{y + 4}" font-family="sans-serif" font-size="12" fill="#334155">{series_key}={value}</text>'
+        )
+
+    for idx, (metric, label) in enumerate(metrics):
+        col = idx % 4
+        row = idx // 4
+        px = margin_l + col * (panel_w + gap_x)
+        py = margin_t + row * (panel_h + gap_y)
+        ys = [
+            float(r[metric])
+            for r in rows
+            if r.get(metric) not in {"", None} and float(r[metric]) > 0
+        ]
+        if not ys:
+            continue
+        svg.extend(
+            [
+                f'<text x="{px}" y="{py - 14}" font-family="sans-serif" font-size="14" font-weight="700">{label}</text>',
+                f'<rect x="{px}" y="{py}" width="{panel_w}" height="{panel_h}" fill="#f8fafc" stroke="#cbd5e1"/>',
+                f'<line x1="{px}" y1="{py + panel_h}" x2="{px + panel_w}" y2="{py + panel_h}" stroke="#64748b"/>',
+                f'<line x1="{px}" y1="{py}" x2="{px}" y2="{py + panel_h}" stroke="#64748b"/>',
+                f'<text x="{px - 50}" y="{py + 12}" font-family="sans-serif" font-size="10" fill="#64748b">{nice(max(ys))}</text>',
+                f'<text x="{px - 50}" y="{py + panel_h}" font-family="sans-serif" font-size="10" fill="#64748b">{nice(min(ys))}</text>',
+            ]
+        )
+        for xv in x_values:
+            svg.append(
+                f'<text x="{sx(xv, x_values, px) - 16}" y="{py + panel_h + 18}" font-family="sans-serif" font-size="10" fill="#64748b">{nice(xv)}</text>'
+            )
+        for sidx, series in enumerate(series_values):
+            pts = sorted(
+                [r for r in rows if r[series_key] == series],
+                key=lambda r: float(r[x_key]),
+            )
+            color = colors[sidx % len(colors)]
+            coords = [
+                (
+                    sx(float(r[x_key]), x_values, px),
+                    sy(float(r[metric]), ys, py),
+                )
+                for r in pts
+                if r.get(metric) not in {"", None} and float(r[metric]) > 0
+            ]
+            if len(coords) >= 2:
+                svg.append(
+                    '<polyline points="'
+                    + " ".join(f"{x:.1f},{y:.1f}" for x, y in coords)
+                    + f'" fill="none" stroke="{color}" stroke-width="2.4"/>'
+                )
+            for x, y in coords:
+                svg.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="3.5" fill="{color}"/>')
+    svg.append("</svg>")
+    Path(path).write_text("\n".join(svg))
+
+
+def plot(jsonl, out_dir):
+    out_dir = Path(out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    rows = load_rows(jsonl)
+    write_csv(rows, out_dir / "real_llama3_optimizer_sweep.csv")
+    write_svg(
+        rows,
+        out_dir / "real_llama3_by_model_size.svg",
+        "parameter_b",
+        "mesh_ndim",
+        "Real Llama3 optimizer profile vs model size",
+    )
+    write_svg(
+        rows,
+        out_dir / "real_llama3_by_mesh_dim.svg",
+        "mesh_ndim",
+        "model_key",
+        "Real Llama3 optimizer profile vs mesh dimension",
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    sub = parser.add_subparsers(dest="cmd", required=True)
+    run = sub.add_parser("run-one")
+    run.add_argument("--model-key", choices=llama3_configs.keys(), required=True)
+    run.add_argument("--mesh-ndim", type=int, choices=MESHES.keys(), required=True)
+    run.add_argument("--out-jsonl", required=True)
+    run.add_argument("--skip-solve", action="store_true")
+    plot_cmd = sub.add_parser("plot")
+    plot_cmd.add_argument("--jsonl", required=True)
+    plot_cmd.add_argument("--out-dir", required=True)
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
+    )
+    logging.getLogger("autoparallel.optimize_sharding").setLevel(logging.INFO)
+
+    if args.cmd == "run-one":
+        row = run_one(args.model_key, args.mesh_ndim, skip_solve=args.skip_solve)
+        append_jsonl(args.out_jsonl, row)
+        print(json.dumps(row, sort_keys=True))
+    else:
+        plot(args.jsonl, args.out_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/profile_results/real_llama3_partial_presolve.csv b/profile_results/real_llama3_partial_presolve.csv
new file mode 100644
index 00000000..ab7b7fa9
--- /dev/null
+++ b/profile_results/real_llama3_partial_presolve.csv
@@ -0,0 +1,3 @@
+model_key,mesh_ndim,mesh_shape,parameter_b,graph_nodes,strategy_options,option_tuples,strategy_enumeration_s,unique_ilp_vars,logical_decision_vars,cluster_copied_decision_vars,decision_var_build_s,constraints,solve_s,status
+1B,3,4x4x4,1.2358144,4140,662279,181062856,459.509,20390366,181062856,160672490,462.310,,,timeout_before_constraints
+1B,4,4x4x2x2,1.2358144,,,,,,,,,,,not_run
diff --git a/profile_results/real_llama3_timeouts.csv b/profile_results/real_llama3_timeouts.csv
new file mode 100644
index 00000000..c3e6c843
--- /dev/null
+++ b/profile_results/real_llama3_timeouts.csv
@@ -0,0 +1,3 @@
+model_key,mesh_ndim,mesh_shape,timeout_s,result
+1B,3,4x4x4,1200,timeout_after_decision_vars_before_constraints
+1B,4,4x4x2x2,,not_run
diff --git a/pyproject.toml b/pyproject.toml
index 31b0df19..3c5a55c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,3 +61,9 @@ exclude = [
   "autoparallel/tools/overlap_simulator/repro_.*\\.py",
   "autoparallel/visualizer/build_display_from_json\\.py",
 ]
+
+[tool.pyrefly]
+search-path = [
+  "/home/wangkj/.conda/envs/pt-dev/lib/python3.12/site-packages",
+  "/data/users/wangkj/pytorch",
+]
diff --git a/qwen3_8b_autoparallel_30steps.log b/qwen3_8b_autoparallel_30steps.log
new file mode 120000
index 00000000..5cc45d55
--- /dev/null
+++ b/qwen3_8b_autoparallel_30steps.log
@@ -0,0 +1 @@
+/tmp/qwen3_8b_autoparallel_30steps.log
\ No newline at end of file
diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.png b/qwen3_8b_autoparallel_30steps_loss_curve.png
new file mode 120000
index 00000000..c8413f8d
--- /dev/null
+++ b/qwen3_8b_autoparallel_30steps_loss_curve.png
@@ -0,0 +1 @@
+/tmp/qwen3_8b_autoparallel_30steps_loss_curve.png
\ No newline at end of file
diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.svg b/qwen3_8b_autoparallel_30steps_loss_curve.svg
new file mode 120000
index 00000000..babd3d4e
--- /dev/null
+++ b/qwen3_8b_autoparallel_30steps_loss_curve.svg
@@ -0,0 +1 @@
+/tmp/qwen3_8b_autoparallel_30steps_loss_curve.svg
\ No newline at end of file
diff --git a/qwen3_8b_autoparallel_30steps_losses.csv b/qwen3_8b_autoparallel_30steps_losses.csv
new file mode 120000
index 00000000..47d30691
--- /dev/null
+++ b/qwen3_8b_autoparallel_30steps_losses.csv
@@ -0,0 +1 @@
+/tmp/qwen3_8b_autoparallel_30steps_losses.csv
\ No newline at end of file
diff --git a/qwen3_moe_mast_20steps_loss_curve.png b/qwen3_moe_mast_20steps_loss_curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..8b4d9c43f227e00009f42077c6b257e19591586a
GIT binary patch
literal 19666
zcmch<1z1&GyEZxj2|+@60hJO1q(!6~MU)nh?vzHlK><;bl5Ujl?pPq*5{piyVbL9n
zf2`%Zzx{pR-us;YKj&P>%j;sU8Doxkp8L6<81s{YoFpOsZF~rV2w%Q<rU*e;F%X3L
z2oDpSvHYqu4M8%oFQ17iyCfi{ojphoXxn#ezWEb%Jv5G>Ba_P3-zwq3m-xUUCKfAl
ztLugl-rbKf<9HvhVW^iFZ?U>7b+P5LDwt6GJUIKk{qg1PuiU+hUq|<vXrsHij!vAk
zW<Ab2j}pFyhhu}+rEuShU;}@l7liLwz#rvLcL>0r*LOQWXGA0MO27|=2milaa(DW@
zWTh*cvR|mVC5GRC2L;-z)IhuizLLQ<6E*9@v3ta{Lcu|S-BIvTy>}k($hym|;QJTi
z*pUT&*RB)OAg1i~`jO3*TQlC<bMx!RetAjipX+q9o|ktems`z8-m7I~S4=a4x3t_p
z(Z4LL^&Uzs?c?y3Jg+l2cz&Y)?fuD-jc#~NoTFT(P4`4=9OQiuH_dt&mzY))Ir?E}
znN84ZmnAU3ec<jEuBY-=X$5OS;-x+7AUnHZ+qKQj%@fbv2*%fSR^1uK($S9`voyrF
zrgAS{F(ZToxY_oOm=oFt)QdEvJt<>^73o?6v$7s^k8aF3AGh`XT#30k+7l}zC}?NY
zf2i}$1DjxJYdk;IS&p6VnO<VZJsQEs#(%l)E%4VI+4#z<FG$=$p5>Rg4|U!3Ct%bh
zPWkIS%U>kOxp^L}56Qfqa_-B&8RTh2T-^@O(`U(%<Hg$DTd?(1Qfo_<v%Y$pnVS51
z$=lsKN~X(~#pfZj8BuK697kN2TGbV%mBACq@0QuWkJwCDzq4a)nR><W-~k~upPaxu
zA<x6}<LuIQ`9$pox8|>|Lw2sqPwZ-1TGFzzRzku5TppFYnKzH->xbeF;vEn3ri&c6
zeRRxdwcV7N2e+;CPG1yGR~@)U7t#uTi8yL$X=`n16Y}`Wb#a1_?pf|qP&yDl?vEpw
zYMRHs0lgCrQHput;YuL|)I?hv-Uk~^LAm5Ag^I86!XhHdR{CKLYn;4{onsN-5H5wH
z4Z=dTdAb~z3r(y|nTG{x`?-P#2OB4W7*C(4|Kb8KoT{)-K1?FZ<*D;HVM4&yChN_N
z-)z51^lr;E^gl^W;$L^>YehaWQ0Q2_SXb@oPwdvjyKcAWHkyQ`6h{};RqJ**_vK|O
zr;&?&XWV1^q`yjKq7xG8oPMh$CaxoA%(v^Zk#QP18ZXxx2xVx^6crT<y_(bhT93SU
zHnsY@k7}I8&h4_oQTps~ZEZ19`)n@N{c;3Jv^w=I|6IBKFuyGM@#ApwFbWLlmKgcS
zi0K?LF>QS;a&@S*+xfzMu|Jwei9TBKfv_5{X^G%Kxy9Bu>_Al})r?!@y@`W`&XteT
zKf5pQ+d114<SAR9rVZ?tj@o?q;4hiBWPGXhzy_=$t9nuw+j|Ftkt`rjr;alQX0<8@
zjr}4QEpRUTSjAS*v%3gEXAT!c8|!nqXvXmF_8`jPERC0LF<_uub0Q{W9{b0uXZqK7
zb6-*t(>$j^F1y^8jHZD%<hPvLU=2ySEls88j@n2~3e`BB7kMG5#!IVfOo$#XjaI6T
zHrI|gUzBKnxOPiPwHonIu_`nnWVtsXWiAjZ;&nV<C3pJ0JwVznpA=SMZ`41MY9Wa*
z)z;F&IauFOUoK1!DD~EzD)!8kiPgGGE0V2{T3oglo)JVJawaxG!{yZV{sFT;NwvKz
z?JVr&M78_uH#Q+yWQ8OV<@Ta!emY`hB?heh=7jV1T6sLX&NEk+DvG@$i``fC6|C_l
zeF-lM=G%HF58~=#ADAmtE^TCs^y>m#sObf8VsG!rs+wUmtD`+T&NR1m<kj}&*22|C
zzo)bKwX`_}1Oy}{+Y<ymg?Sfpo^aNHSGylIgTB<+3xD>o+{V&<Cc&y%?$zJWJZe4l
z`SWKdXXhhU<rI_hF6xQ$uJFmpeN4sPj;v9uskqS4J65B4D(W-3scVyn*A__Y>F=L*
zlotAz@7aiWo=)ZMs@S+tM!?ob?e~w(c9C&#tl5+K?BlqM^a*8eN<91LZogHs>Janu
z1$+F~1qHLjYlbT0<Y{~1ypBIYO1(78j3+nG=7~Rh&%wM}&o#d}J8cQnU2YFpjM@l$
z?aJVV!8xVFel$3mE3f`6FpgrOVXTj!dfI!cU)d4Z5u7Z@X}I~FbD53)0sRRD>(uHV
zcW<1>if%^R`sj{0-d2JR1M%%M@5E{u81GcIyEpO|*(H^LqX^46eEnE!hHr1VTPZ_%
z=d|2HIY;3+@}BRf01TBzXUaVD_D!r^r~I{d6IT@#(-lwhl9C1w0fxyZ*WdBm?puAG
z_vX6U@&Kvnf^W!0{3lxB&vj&O%d=Q=-MGpGd3UT5U@p~SLx&==F-|~28RU~6+@}}z
zzV3Lw5PMxj*8I?Hc^j9QP=%6`a<oBdt2o?5r}pjo!TK=(x<eY?!J6hvHXL0J;T|Bo
zonj2p&V3%ju0gbgy{)b3y)aknfk9YWkiYbdOt<Gqg@02BzvrR&1wEqe;_*5FMEPVP
z`4=)C*U21_zZPTdDc}}^PRldC85E)B&!2mq539S@{&csQ?Nq%33$btGA@wq4ztY~`
zPATGlD3~QbGnFw)PDxQ|)lOq3kSssbM=f&TS?IHG+rD}6>eaxOhrhCitde&A3q<9+
zNA_vA5@wM2F`V9=*_;$TOoQg%4GlCrM=nNZk$Cm?@<{sZp{iba(hO=r50+m|smJJl
zK&NO#Tusg+bSa-$vG>1>e+U=Z$aV>XuXB5!8W;Kogh)uX12j$)^3Lv0p?w|{Z?4jm
z!Fln^_N1%4ja2}-v#hIHIQ!*Oko1cecp)yfyZa}z6s(u~-ASNnuJuQa{CFirIrbFq
z7ZC<XuTxy3j?iye$%W_J8~)ORAE;h7|G1Z?`wEOEx=<@RBm!%**qdi};gC&Z!IQ4V
zHy}(GS*YvMf<@;St$KI@ldEws=>IyKDb4Cbr&{2QILU9iTG1>i|4gFYQoZ)LCkz4f
zCc>$~i~4fi#yjmvBQQVr+T{*!@rt{xY-|Xwo!M}jJlVp{|81I|n2_K{Ldk98eY$Z{
zPFWdDCBW!sYwu_^xoddwyO~ySWB;{xw*EcILrdu;Sro*7h$O&YIJz$WOtO7{jc0#w
zG1mmH7v&s$`eB4Jv(C;ayb?57V&cAc83pFH_UMNqe=89F_zHxu?QzWh&mm6f@aG4z
zg|_R#^&G`-2c3r62on+#gsrDU-Hvw`b9*O`(u^I*3snp@E8n%`g+(!)ysPtE-n@uP
zcFdhgkw_L3VNol2%zK|2rr&eWU#!Ib{X0N-yVaJYpL40eFj0uV!94fAIpICDCL%{Y
z;f9!Mx>q}TbXiloe~Tc*WqZia%p~zH8CYQVu=q-A;)}Nl`#3%u<0Wi~BEQ$heWksU
z8{DQ%%=kx2dTMxkV(K69^FPPCuFzvMnn5<&M=khPZx2ku2cq*D@oL3e$crSN9k9U1
zAADa6dF?9C^d+(8HW}I(c~d$h3A)l-DHG;B+=^Jz3Q6W?zIYsZx3!ow)1PEufFqg>
z7JHs`xE0WobkrW6X`^Gh5Q;o7S3TOcEqqlzSUGzxKjFH%3nz>xtRtQ3>@*u$NPdEW
z<SQ|yoVr8^H1um<8xg5G$<+4XdiQ{N)Mhrja=$NW@BtU4D(jo!A~~t#porrJ)fA^7
zCh?D|g*t_5^{<jbEe6V7jpX47o0TrJdLi#hO(1I|mv6(FOuH5oYtl?s8hK=I7wWcO
zKK8Vh%FG>2%dZp7*e`@HPs_5vTVBDAD+TGEWyoB{e&|nNEUmlHT7Q(dd)_^~GVL9+
zPv#9%XnL@hvAEx*ZxAN_k!yVx`K>}d{Db(%3^Bs;B)De?Og**KrTm6Hj1Wm3c<bK5
z5i`}kz7Rs_XfwcOtju$Gj4?=LRgEVLkzrNa%PNrAlf+*E1~73!tU1EAx$hc(BhG-Y
z-ma>uYGJSQOg5x%*L@ctl!1zLh`PvznQPeWr5YXX$js%sa=lb5u!ZKz`NO5|lGD+p
zS4!E4fPwkSjZfj$UszNa7%0uQm5?rNUgLafc`!K36v=KlqVMBKVx({XHGI~(+xvN*
zVs^qg&zn_w)xn<J+IY<iey+s;2D#*LZ}-mds@>VX1Wpr6g~E*UUy)1^k(JJyu9JIr
z0F%`^zTnB_baSb4ud01H`RWx%#<hhh|G9U~UNv5v!x6rY9w)ApW`;W@@O7jo?kPob
z`D2&ug8G03E5sLgk+tkexYJ7V??Rn=If(~LZp&HW<*-Ka`wTEy%0Y(-foXz3wemw7
zgs|UpR{j;Eov)FzeLf!~1w$$2&hgK@6!Nl2zP|}O!uni)65se~XxVzU-li*CX@?75
zIO1U9N%>JZ$YHmNw&6Ul{M_?9Vkv*J{)||&$VGyQsgecuqaj_&Bc*h2KDFcJ&{El(
z(Q->T@rBgu=!VVRH4=8+2DihhFAWP%kVWyxy8&f0c1N##Cw85Fe{hr)30nm!<Kku#
zc4#?XoT+iKba`g7?U!B3V>wYU<YD{^%<i@cyIQ2`;l^%qNHWYtfBPmvSA-9#?K;!k
zT(A5xX!tR+Y=cy1xySfdc&_qPgM}`jy3U(3B5qo_Vii&k=$;uED5@P}naPVA{K8!{
zrxaX82;kVs%m`MO(biN|;az9&dqLiLiYx8C>4Chvb{h0kcln~&eU^@{CB0j9(rKgE
zW?gqTefAkw?rX2vqk3TC(}MBno<(*o29mMfy@abp#l+mF4X1nhVQR)^wq|}~BZ1SD
z$n0(UXyI>QS6S5VBqOT_OX6xirS`?a_r~{lbgdI%$>t(G<}0Q5Bco$t9M1ZtY1`$-
zy;t6Nc$}BKUJnlsZ%6XBMBE*<9Ajc%9Lehp*hQKIQGPg|Oe}F~j`FInO}BQ4%2$bB
zoyEzfF<a`1eNIIAbKP6ldba#>Wk(5!+>nlv3f@lles!|<grd%c==GDmRj2#9H6G=j
znL0R`+s1{xC;h%?9k_mCx3I8~=2-`9r<|M||7EZAJ$rT{+1v2wf@1NHfS=kW7X!y(
zS_4iWe4<lB_7TvGk)|&!c{xU(AlI{r1I<(1`wt#y*U~VF4D0kNC+QQ`+i$6?BnC!5
z^&^R7j>^^A>1|mn4mVv`%>o|Q>bW8Z8#lL#cV-`LlWZ(|4{$)i0@|B?iJq|cCkaUw
zyl`J<8)6RYKWK%mv%cWJ5a2W2Po3F`TZ+>E?i^fGv<0*{+_ZlIk(PG<<dTz(jSa`X
z;y!R2{b$+lv#Zhh9a|tWVs2vTa(P;m&9q*EdDzWE95ceqX~Pviyl5SL!U{L%Jes@%
zzt_8%xVI-S63ge>Q|uXEk*Cd43CE{1F!)_C`(!)ST$W$tllVulJYh<fqwJH3yhZ97
zH<=wPJ%&Ui*1XhuyuuvL!k!!{D`qDO+YD8zsv@vy?{{`~)*eyc=C@1O|Jl+Et>5YR
zkt3X^t2r~{y^>XTKYU~5(-8ajnRBnMudPpOB;KdFMM~`+_RJzfHSm~TKU6B~I2*ju
zxak?dhP}?+VbT-R#9%{XHrWe5IQ4Hn4ZIA%p`EU@D#s33P2fm9)D_>Fk>3%Ld0u8s
zbKm*OQh#)%4&pWr@eqxkfx%#NN9B;&)vH%SXklhjeG#3l$N8m$eH~fM7LISH8a%fU
zhtp?v%r9cU-lm9lmN^4hOm}%I^Fk)V^r*5x^ZOolz3qnVPeISa_jTX>uLInr^saPd
zYRF)GZ9z^7BlKH*o^y3OS@`X+JN90;I}5+`8;qAPUuO2zRkNq425&Uuc-`~1$+Ye@
zvFO^`Tgv3(<2xBhtrW?hzfR05K+W6(<K^Y!J3zn>_P<#%lV9-Ljrjl<D$Qu8Qi<4|
zm@16v3Cdie_|Waw%#rMxiW9*e=L<(7Q5GWcp`nl7u2G%nkFrlx*ED!CC{#awFr}rP
zjv(~5Ar&|3OURb#`~^fBpzU?>jWN$VbR4hvMFlW;vd)&Ay{EdP1)M2Y{L8CQ5ey+y
zalB-~CD!`;JKMH4aKRT7n?DQE!zw#R3zf8IW@-!z>*@_d4jIzEA`MKVjE#m_<G8<W
zmAcJ!aJXI~pGqzwxEW!jx`z*Ui}#((J|PoA8{ir?UK6#(k#aLaW%h|9VIO99AG`#z
zM+RTxyqLU0c2CG|W1*kabfr*uX<;E^XBJ0l`OB?YY1a&`>n}SyU%s^Y{<XBURB*=?
zI46k>c$CW876>E6B3|XjqY8)TIqAQCg))Xq>!sW2dX1cBUiU>JEyB+6fjMR{pSBuk
zY4+KF9_nA-8}me?!SSpmyOg2-nDxA=86Xq87XIq~>r3EzNXbXCYJXQfVlNZ<3Fx`_
z$>3mx!&Nq=oT}j()oKGiI=?CE7!Jb^Op#sPv}3@FboKY66ZvvsF`Xe97NXnW;Rwjq
zZ-#!E<odbr!)YQK4qJByrJjV^Eayx8_=?RMw+j;o2N@X|5Il%>Kcu{EchDOzSK{Wz
zjeHbo^ZWSi7{uuB0Z;c0Yj$&3k6~Spm&h~e(Xrj`*a3tPaA5Bw-lr3>V4RnQvf3XT
z|5^&>w@K0xs;zfbI(+w3GMYtq5AeX!W%k~LQiTci8owpr#s(#DaB!>wQ*}|kNl2;C
zBd;Oe@ez!h=VB5ol%ewIOoN)}E9Qyk0>yZVr!$a*?3;<{IkKM^0iaTvd3gM=A-KGj
zx3_bYO1~lEu)h<YxwEqagg@lAfJD-i6|mve)HAESe1^|m@wb8LJMm@-`4lAJwPp!x
z5@FJeVXX1m$)2)Bsf{5+UrT#C?dft$A&gZnWGL40?M;5WDW7ktm0J_eh_-x+SFga5
zCD)QrQSD~b*2H*UU%k+$r@>LfwK;rWzPQ&}`xgGb5Jv5_*5eO?L#w3}V6H`%V<xzA
zdce|N688erC+KpF@X*m~wY#*AfD7mO(Tkp`+5jtujNv)RwfW#DDd=)|-rA@yvZ@*x
zF~R5;6B8pl>6BR}6$X5SaizAdCZWXRh0c20zJ*Iq&f`}Lzcc%joL5eR4olTbTx4z?
zJ-sSh93iaLz#djX8$kGTwQo4;^Q$AEw@u9*krv$ebRR;Jcs=%?on%9ocVU<O9O>21
zeHUF_jV~9MuQNt6OcpQuOl5z+@TRnS%@lcQF-=eROvZAmBJoVa<NTFILxx#4H>)C#
zqM0{1J4j{`U7(hGN!R+B{Fac1-f2Vv{ZpL%&7TL9t0HBo?egK9Ix5u$<61L1%j`NT
zHT;G%N1j{Dv^d1Kh3Od?SHXH&!B*%HTT?qj>C7W9wmhFyJ{|1{DN?5h0OPWmE*_mT
zcK{Cky?}k5KmL}2sBm(f1v0@I2%fB_bE2^OJQ!Kl%A4wZbA?$Np-TY@=G=_(xI7Z^
zwTC3)SB}q&`=efa>DD>%<fl1Y?Ja;sUzU6A^-bf77wC8+?{@X}!UZx_8|qw{R9Q4j
zHH<G>J!SSCL;8%R$Y#rregbwuE9kMmakAGF`*?ZxkZJA6D}U{^u-S;3&U*&!NM<EF
z;6QY%<j<CF{RFPo_f*5FscDZ{W`$A>z4pT>n1mJ6^F%;9c2WayJk^}Xb=QG+f4s2D
zN$kwy4f`P$wB|jMaUxJ_=uZOd<9LZ1IipsV(*YA=vLbXj_RCMCHf4-rUG6#k%vCy-
z!pW&$ze8Adxnw2E&BAKJKm;>(d#GrxTh+Hv3xt35l^uElvsSL%N&pd6auBo8ztG6n
ze8RCPZRq)$DO$JOsBc(zIho&Hn2zq79?}DZF^*qH_fI;m9c&V-XylTMZISix820_(
zTxRdA%(5EHdRI57zE-24XmHx9f4=ffjCN$weLK0EZZvJnLsfP3{fQ?gbJSVMQ_U0d
zw<u)@@TWL~6d8nt8w`&cKosPG-*#uX`#~~nmAwiJI$WCw(j*kNX0RH~R(G6F=<FK7
z%EA^ZDY^>AZl4(v9>s(2y}iB1rfV^abq(5>+=X#>OcJh*Jh8hES@m@M_%KBr_y%bf
z4KU50g!V7y8oQlBb)lgn;u1*Dc*vCfqs#WAl~gytFCv&TRb5(|7jt=fk}B0s9kue_
zg!Nh=Lsd^8Jt{GB2wK0z`S!HVzx$f-Yv3Vm?(djP_h8%FUQvgjJO+Atr^&wh_{y#7
zx13LY&8J?IyH!_TFJ+GT{x<w9%ZvJ+G&a^x$xx6^@L@_n@b<#`5F8cVgatAOc^^dd
z!5`7c!3P~}?d_L>Z=biuiAhM0zM04v3NvI_%p{`XpGbpK&_?8HpQWnUFR{Ork&!ZE
z01xRstGO}`=sB|9any#hEm^QC-8$bKgb8M+-Sx#k#*9^M&U%d2NQ4hbR<HuURzi6{
z(qfcwQzj$_^|U+-t&wiMTfOaS*E>%blV;7cio7IGcgF+fbL0<HoG=6g$4_=&`_^y)
z^GmRBbZfXWfWC2dPA*Utl^78)Jh|h(JaYj6xI2o8ZHrN}rUi(!9|<<WLauhIRG^c#
zcibbZqlTxzU--Mn35&jYNH-lorG9k2#G7AOC<D;6N-9}v(kJ+KB0+<;9K?#Fni9zi
zQMn(dwq3Cbg(l3{$JGh-BrIEe&hv@`KN<!r0B>FVYz#x}!9lEeZ=tW;95Ks?xB?#e
zQsfcZo4}`(**_^4$Gr^vEdaz|rK>;)I+MuCpTx9Jn+gE+BJ-$Dy&E-^9hcxQkkr7w
zwpr%#8a!Xb*=Oec384G2Ip$&jS1xW}G8YFUU$%6=O)~7`X-drO_i814@(u>Shp)x$
zPr43}zLL{5p6vh>&bWIgE6tJ;SoWPW>X|@WV?5Tc(5a$deLCRhza@zjA9pyd3w5ZH
zuH;??J*qoIB5Du0EGE^J8u}zzg=edeQjoY#n;fF2qPntrom~f3brS^9;`|O587}_(
z<s#<g=Y@oWeSR&^T=52BuM9Ks=&Yw3&OqWIP&G@b*yLx-r=jq#U(0j>q4mq~NDQ7n
z>vI3=L>bjFOoBX(Q1d+#ITG<N$7YJ+;@|5ynV8%VZC0rtyw_EaFRTQP4%c`b+S?Wz
zwpQ6XxXgOCRaoCEzd`^F+h0Q(XMd$l5TKnZAFcF~@L-N&MkH_;f;_>h&sbFzKAh5f
zxG0#!pfU87#0>ir(}}8&0U`WO>zZSg3biA>3(>cQ-3(S0Zzul9yUa$-IX3sG#@uSG
z@#g;9bm=#OQ_~vU774QqTdmS`QWTMQm2xgz7o6Wkc2(C_zs}Zbc(&sMAk2|yti&p$
zvQjmD*VvB)MBXf??>>|!CkDPN@ak%uE;2mEK>T1@OWQ&vKcUYMWa|`g;|WS8Xh2-l
zoxKP4M*kwyi;Cw#idH^wp*N=N8&Caj8Kr5Uyvsr0F&9i%y-v4tP0UoW8Jj9mH#DpD
zLabSETLyjlRPVZe7-pEcHd4Tb2>JXOX*`jzg?lvMZA^KNnqRedU-PlsTHD)?sGpR`
zc86SCIHb<MABW+gJdg^@+3C{RBFAoT_HT?)Z9}dnSMcZ#5i>^TMIcTHpd^X7p`cWR
zj!u6e=IX)Ph}#MO%gga<*k!Tnq#Of7CJ|zCg=!nn6F|?LE=qjMN>|&(n!Tws>kPE%
zo`ts&mYQ_aAub&^-%ph1h`62KJ)i%*vZ8iEcIq|#6mMsDvHR&I)z*OXN=vdN`N@%W
zW6m5RQfRl6Gd))U$56Ef-<Wq}jM{1LeKDs~`)_bbSI1^Oq5HY7rMQ8?7Dn^*l$NCD
zrIl2>OI2;T^?*@(!G*vP)eHB~$cVw?TOxNdRV)1bo+UQlxIyZEKpDlHIhv>YoKJxA
zIHE63&~2h6Ec$7|!3IxRM4N7Hy5mH{g|Sp*mTv85lDojjc!f7vX?jg920F<n30|1Y
zeXKt_JX#%J1bX~)veHzJ1_;{k`DVVSHnr=c>e*VKx|^b(KD8--^F5=|!(ZZA!=Al$
z;mz8?*AJWZ+c%4_D83{X3nUeNe36^jO|D6BgEah<i0V2erGWdIns2p20C395a-=?e
zg7cg9sFk>gHoKJvCkf`)dm}!u28rY`Sk2bc(zq?_M6*x0R@IWRgciDn6BE<$(fd{1
zq`F~u@IklJydkb}N6Aabb|uR@_BKxc_l-;-YK?K0Y#sC<DM-c5zuP(F%E@5TwQzT9
zYpi}N&CK491cbwBcbmsFE4SKO+VaPXj4$U?50F;(%X`fOE_mlh5VauiBkX#59))mc
zieP1>bvhp&?`W4_8_uB@>e&kC(NyX8msV+OYn>(z94m4Kz^se6qdqAw?i(U?*|0uZ
zYI4w+Yf6kyd4`tASRj$%8NIQ5&S7Qsmhc)HZ2^x_n5x>bFMw=(I?OKk*QDDVRW{4P
z$z8OO_uMP+jinvexgN2ci-rC0H<6`LDsk~)Rn=^;?7Z~6m`-#*+B;c541ep*%#Z8p
zFHnc24+J;@T}uxDVBHhXH^2On)3jH6VQEPt(**v`v*EGgSH)x$sB6weBqZcH5ORyT
zm5zqnVn?0g2YxKmkT1*E-;h%McCD?zfehbv>>WW{-xzfWSG%=+KFZqoF#g|X+9~TL
z@(&nGwt=~Z6oYO7(+p+Z;4Bfxn-_2?ExXRo)^Td;Jcb>P0YQWBZ;e+ns+dxOdn$Sc
zeDl2Q%I%^KsO!{EO?$(^0*+y55GWd{RLN+|zwa|Dx*J3Y?qre(E&rmNrsEslWQ1y+
zl+P`ckeVWm^x@(|GnI@oOB+$C2cUJ)r4=!ibDr)D{>K>>BOazwatGs)+6Cas9ag*&
z@rQr?mPx+mWg5!mB`TGg`ji6n^R}4EOL{pR%8;0)HZ6V!49LgxPDjK1BI%uos#jJM
zrN0Gqo2HQ3H&_9TVZYD2kd|PA_cuJ#1cuxKucV6r%pp?SJjh|?c7)tO)Z`u}<fDAA
zgTEkx-{F8Xr)lYmi4|zG1h0gsw`C<3+|mjI{nQ{a3cGC<Qo(`*b3FL*4ng-Lj){!@
zztq7L#suC8+&54u16LkVfNZN_S3<7Sm~k+;bW<$UA5%u_ygLv32Uwa^x(@MMTi~Z2
zZGInfn&~?r|0CCpzTN!&Qh8bZcXgBk0px>qw?mxz%d!T&1qNg*dPV&qBfN4M9-NbH
zAwK7H4a@+w7<rDV8!2X`A$U9q1Ym7?%nKrqLX9yX1?u-K8&p@xAjn-J6np0TThIrx
z&J+g>pYqiJWnbr^NTho1LogNGCPwogzsv@UE$8d;_k(4m1U>bA{8dV+ruODH@Kn5b
zD7K=JQMraVn8?rfIxy*rU9x1S6Yqdwe7Qpq6BMffDoilM$X}2uiRU-HzOLv7gf<4u
zjGBzpH<gJcoN9pY<m(-nt9Ko=LigoOVT_eMV2U#Io|C`Om8rsnLgiw@ur0BbO2Kpf
zx6~VR%50eEx56sb_kqL=aoi(Ni$}`7CEP{JTI6@Ip0p-L7}z&^3fm}zUaY1<kgeJk
z^+xjaGL`5)&<OA2_be%20EQ`TDq08*;DeqwzPoOO;rJ-#4faqO1mQjp#a8~H0y=*G
zin?g)n{;gSGkU~5=IIYsF`*4C&|RkosP3|-V<}42fXOD}Eb;Mt%!{3ytmTPoW1>S2
zr6~4vU>=5K;X_5zfYplsGyy%Md(Sc_x}Ry{$WQVSbcO8I{|;$sYiT*8*}edQS=EAR
zqt(ylBRLdBxD%alz3z8^nZ5))p3Gc`uV3m-xZ`(oXDAJr3s+NR<^74E$>)TZBZb>%
zlTMomxXns%P`BqSbIQd#;2Q>k_)udo7mxMK=AgZg_rQ<HuIOUB9Ur$_J%xjEBdOR>
z*2mub|DX|rPTeL!rhV!zJBlBjLS<jP7^`w!{$3q!@4DQxm{|HU6~|)4hOPsXyonkJ
z1(Sijn95|cWSH}oKgihT)$eUyY>!p(%KE_e6-X5T<3dO&<Bo9x<!d|~w+;;o3E2{O
zV-z1ndv<og(9zb`c9l+0a4J0@DM{VHz<||paIa~U^*}LW@35<&!_^7{nAOkDlVim(
zWdja7WUiSJZx%Cq(xZwqaRaDpn~Ag@Ikj!N)YX@0BDA<~8<7IbNqlp2X^9^s=^T^n
zoSjL<TUuI7I)Bmmef<16>Z#R4<lwRTTDlAc`St9-+sEhh3wBq^7i4Fa)U;=K^P0zr
zz;3KqV51S+&~zi1f#vR?`Ep6?o%ooTVduGzV`F0{Fyya|JWc(kb79~zH_@pwzkbN<
zheuSM!D&ozM|-E@5#$q8RDCBtBI1ed*h&Y3JVkHpSH|$bIvDWP(m&&ROfEB<nl6K+
z4nuzekKy17C65g#L9(BpV#FuE#+y<0WMHX#J<-b_1ELD}D&{m;#s?x(Un`>XHAYh{
z0=7<%@Cm$&057L|CR(jAsB{nK2r;#zgdnEksM>OFZXbSg{8B}}pKjKx_bax~dl$`{
zkSL=Za2=#Z^+@YHFo6;M@ifnS4QNI{Vu}GU9${?zHW?}Qmya$IgPVh4n{9l555GOm
zaGAUuVqrn6>?NabX$1tO5s8tDx($3PJ?)HSA%I>Gyl3&*UyNY|ZUpn4e-wA+J@YnK
zac{+%01&kVYQBnS5G=Vc>*zm%OgcVAcpVz6j94;0pm>r}O^)^!ePkc;iskP81wj*U
zSRp+t{p(@Rli<3(QC?V(j~Gz1A_<wFn9$58V4}E#u4+J{QUEeiFi|J>0YHe##-w0F
z9jJ4{sB>31OCV*`eYQs6VFe-jHyDi^00G8AoM-NT@gaZ?{YEgM!{=)}p{Y4upW)*;
zP}W~!<UZ4b!G90V{BP$IJOnD*q=0@CiCs=>PiW(o-GY2>gVVhD2o?+5R<>U(P!>VA
z4TkL<#@_BV7SZlsT2Kl;c&*PS^^cfDZ0I$Zn?}kObx46ig8-TnxyDV2cS;Bp<Iz0=
z2xq0I4<FS%^e@K8RaNiwoX8<dEHHv?>#}}KA3}rY<dBTS84EP_7l6G+LDb100G^Pd
z?;&WE3fvh=Z&Kb5WXB4iV<R02xC_&T=O&~^3z&c&2hJ>E#;l*{83v?=nixI(iPF+j
zv?&iY2LpRSG4WPKr24Bd#vR0ep$w~XVARiRh#-g_tfddv8&mbfzq&QFp-_Gn7&fG0
zV5<{F>5Czn16Bc&`Xz@Cq1!A%QAppjPo+>OK6phD85sT}0uUCkBmkyJ<{9wpHJ>G5
z`#7T-MGS_N;*(=R6(2yQR)w5E-(xNU2w*srg_Vc{rTBugIby$Apf|w!_;it&Vu%8p
z1nLR<Fd@*Q@)ur$#xSrTq6YwRsYt=84Zko6)ATpeHQ0KlU_^SV3%t4Y`+4Uq>f}B`
z0K|N#AIBIdO<<wu39v*yOpLdTY7b}@a<A^guMko5%|u+3^a<g-zy&_7v`T?ZqyZ2D
zJ<W(~CBOs3BNUz}_GBhKvWS0T;__yD03^^Uf)P^vw=qPg%Vv@3r2#ZpkhvtV+GBh`
z^urC*yC9#OY_1L@ueB6V+#n5yvFc@?!E(qaKRoI&$|$`h_c7;V))qBqiMa?Qxl7M&
z-F-MMbVjIId42>^x8$yiD16-IH`)15e8e|?<iwl|yaM^NZ(5CQ!L5EIvJ|>uG^a!}
zEQpuH9s`jD;<vtNuQEl<BO<5GmlcgeBmcz%DN+uFS5Ik5sy$1)yIhWqzO`vAGG8py
z=jU6mU$b&pjKgb{@Qg&(Yg&wd8k#e#oTwh2&`9Y8arxC7km&Tv6DS3wK1!%-j}o6)
z1(W<*EnR-<b<=(Qa2kYDxF{<&L_ErL5NlGZxKK`}cYr2P7s^Z{izYzoP5QL&Td`}F
z7f6m0%^qwS8^P{_iV3o=sI?IbJwUWUj;uFMFvDmEvWO;voTR!+?EE6R2UEuZ^Xg;d
z<5OR~ZTUW0-iPGZOP7xa^3>$lW7!KU5Be<#&<>4nF|)RRwu6b$&<#^Kmu)u+@Bmx3
zPPEU?C<p6l{DcxJdJCk}hXY6_cOVcm7ammiP|_Lm8Aw$I&?IPyt`D>5I=D^Le5G9-
zI}l?o2FR4p@KA8+^9&PeC+fS<>Syrm3ub;o=;3>CR0Q2i#8E7JFpXKn3#7Qw0EpkT
z%^eZxC~Q4-)cgEz$rLjpFi?CGFs^?tiM><*26iEBN-Tr}4F;ncm7GvlVQ4Lh!0>ru
z2g+6<pTwcqX!zB_249|a`6)~vMu2&cT4dkn()ZcwP&P`Q(`te4B?_IE7J*@*5EzR1
zCc?$+LV6RjM-52o`w%3Wg95g$RB*(EI>-G^4(rdomUuwtWKifgMg;~GgC39sqvLI$
zcO58rn3Mpjcn@`?DG82zQAds`Ly!*<7^^53N+5zj&lPTgr(d8hz6S3@DQb#}!)=HN
zH7W%ja4i7U*D<U_7G{wMan@>bG>P&_49zqtS3;|5kqBz8v^mhKDhfw6^)DfzBKcEh
z`_(Os6cmU7=$0$UGla%yP18XL=??X+SpUcpmZ35qHxx42vhedL=$Vr%u%i~_x>>tA
z=s_fEZm1~Uk}Ghb&JL_PpaxlvGG(H_S<s*l&47_9P=L$OrC0*?Flsh@FrM^bR)nCA
z^uZA~YD7L9U~Fg+YOK9(ih&lPMrG7hf7I1mZxXOAalsrwr2}RlnuKaE+ej**H+pok
zMgoJ#58Nag_mh|4VXHdepOAtOWA~b*nrn#D7`7fG7#_s(lp)`IS!1>-zO%%qbpTLT
zpT%3TiQuXf&bW0w?uj|3Cl|;A1@jogSm^01j(oO69;(*bA?T%5^U9@)!Q2(GN>z{p
zPg4w!<5%X(t5-i|Y(xjqh;Z%hgkuSeOjWtoO7~+RB?wfRbm=eFQMgF>O4mY6T>M8{
zufW;SW>mL;DM+%(-sVy(^$Hx6z<{=CPpOOd0V&LExZ{50<zlIo0K96qtg8>*4o+o_
zrNKNVJK4e2t)slu8}ANRnu}RlCXoDSZq{~NS&e-_Pd}o_?j?L&ldjX{Frzb0##hLy
z_fIZu<Gbj=<4?}{hDF7O92U{W+C;~W(R{~a`A%#H*y_&CJRo%<V+pDW#Xn|cz2O_q
zNOYJkE&;+X(f7{~fJx;v$1mQFZDu;25m<;<Yt3i7P+035g2F^FzJJiU^nfP1(en#@
zX3ZNA#5i!0UO=m9#tA9Pa*0&{0tbQt*Ts1S1b7I|d0m(4K!iMmMx@?sy)UYll3AM?
zI5hacv7NOOO~6(Z>%1diHbJE;JNL^t7tpt55U~_-joP^=x$b-B=DwIUupMrL6MS<Q
zF`8G84ZO7slXqqIQgU)CH8}Bez=id5xj|!ap%ftT3}w|X_9ie<UVayPcAx<2sz>@8
z{3q}v4U>}^0CJD>_?+NnK)Upz-+&II)X(P!>L>tpwCjR<6K#bdl#k`3j#8vCk_;1!
zAruA_@yYaJW}swVFEAz!dj?gk?}IQfCx3*Nby-7m@)uW8l1m2E2OW|ZrM@QD%k=u1
z9s#PQtgBWyC`S<T=>iTATH&*Zw;-%5sAmp9J7|sf`Mg8)2<4YSzzQT9FIr`f`)Cme
zEad^e9csk<($fvhR^F^1h#~-v0~PTA^y=yU!G`MLANmE}bOY6-d+O|U=@6C;^bdg+
zejM8?c_?l0G@!;B6GA;~i#Ih$euy4=>KwOT5hqH2s0frwizv}YMBs=Ree{=7se%Bi
zPr-rN>L8#1nwM-JW>H4LITRD;&xCq?Bc)8QxHf92bF3LA-|nKGOhMg`iMs!0<1mXQ
zE$S%kW48JjdW4TYfFl-EyDmH#Ci-ZEY`ibi`(p$JwLZ5%%Lp`I-h0GF0Q_yBYS}w|
z+2bCr_4%6DsQ}9ct=#~9BS#;7a*B$qOuwpO$2u5UItXrt;GB=y*Fl7)uhb_UMIs@V
zm)_)2uIg!|sF(ihh70Mzt&i!F<f|x%fmnJv_;K)GUpZ^Z0qr#IDr&U*NL<rHF>DCR
z1&bAhVph=VU6#9wGb73V54!K7DPiO7s$^Cc6wpC=0W1UrDBy&02m4D})KN|WaPTQv
zON;POZ&!d=OJATTRpFWy&I^JEzLu3UzM$9+ME$<qYq!`6`Sb}PG<Qba1uxL5?F1>#
z1Cb*85R5_G(?4V@-wn*s;(&>dveIs!&Oe9T50L}EGoK54+3B^%0oTcSm9G)@W?pKs
zd&OfTjiXt!e6HaR5N#~RZu4bTYY0baOzZy=zI+@cyIFdS?ZpoN#e3afJBU98F~xr-
zYmNlK$yxP}Qjd}79KrN8F#PRDLPN7T_*A~1g@dD*zx{t8ZhU7abL-@;)TJ)&#;OZ-
z6+S3V@)%7|8ufA6Zq}X{&W;tU4YjJ49UT+R)D@>1dkts?`Fc+SkOQ&z%}r~Nmx>u<
zQ5W<h0hO+HS@8M!%@L5IqOLvM0ugvcv5JVc{=0>SQU5LOy)Vu*-}v_Xyv19SAdao!
zdqI>s?qv<iYaZ25cmxZH&$Y1ac)hA`+rb5rrEMZDot=yq>tJzJB)VB<I+xz(m6qBM
zrF;lC?E|Th<CUEuP|ILBURO-={XZZPB=<c!hz)I~8z@OBD@~fCE&@VEa<#QrbI|Mf
znBX_y%>X=i;8)mmD%{3%98vM#aMPD(lGirZ5s|6So8=pB@fvhg>aOS2|7Rq%$RMaK
zFsdMq%N10wVBjkg1J*~I+w%i2b78?K-J<BKjxdJ!M}~R25fMkVXGRTmGb1AZYCljR
zQS0jB^#xT*o7ly5K>WtF04s-lgeR`0pzJp^{Ei!I1+9v3JZg#-bDU6=#`|=I8q)zc
z0pQm+V{#M>Ldu_`)X~bTv0B;l;~V<0rc+&ICX}BrCZ*gR4GL}(N~IGgkt`OoD70ww
zT47Kv$)o-+91kj@Jq^5Bl=tq>rP}X_T8<+e3R!p4DA3#mDk_TD)MvJOBx%8eVs}um
zf3|up8YZ0NE^+jX)XzjwT1q4)(hZm`S}Upiy_{LpRnR1<@!d^J&O3cMz@Y`PFscj_
z1Qz<}?hJ+rF1lxL2TahU37Y%)h7(PRAX|sBTPPP8&{4dJYsw&e2SM97Ryb>DKyV4<
z#v(({$jbL#Q8H8`5YA5%y)*RDhXE74-{_+@aIO@6v?Ut<&I%0^b@5HySikZ>AQe26
zVPHOjX4Oznqd?;yO4MlCHhFD#@u0#@A{hPN(A%e0I7N^DOojq8W4Dxjv0XuArpe2i
zRdkTr=gGik9NhX*iZgdT+Cs2AMftBoWT^f_IVk=PoVvfS0ROdamRbAHujc~Q<}j=;
zCgb~!yC4z_MMVsH!aP7c@BK&#dajFZWSFp`Tmj%FgDAvq1(JnDeMAx6`Bmw&;Hx)E
z@+45p59PgMu|RnbAowu=fN;DFEk|m^tfl1W4iqtIn6RSQACv-3Ttl;YDCHVBLXR=!
zCOATiR?4Fi%QbY15t7}OXf_U6VzQQ^nL9LwuVK>r-6#|>;vQN^|5KHe*I~g0aRD%=
z9@pbita>8Sf=9i^ARmd_um`#V9o+4X>rij9@Bb&BH*qEzl6?E*xbq#gh4ob3bow%M
z`Q0B>kr|GFaX>lb{A@1-U{aH(^|(ZROBw_>ZrtFuo*9QZSAxwISj>?YgIP^+2bw+n
z;O0i6pO4SvxG9XbW{5+A0*9^~OA~WpNEGV8%+QS;zKc@-9bw7(=pi?^9Nzr=Je$z|
ze$Ne3Qi~A`v$y{<*ZY&cVAVHlH$Zu@v{LF$;4sR~vwQcj%eowBZm8qGhxIsfKjEfH
zFa{-~_#;@X&k3XCkDUG^D~HYdCzeHigTGJSl&xa*7jwn<V@mStW~kQw8{QZ12*&4g
z@vT=hxffggAA!1~hr2t#?(kSmkC(U_(miY6^kk5YH(UTWL$#7o@C>6+l;>MWZwm^O
zL7~bX{0i97B0V8uV5@6WNX2#AYqz)Y`qPFB4R5o2XBzO7USU;ie$W4=1B+s>AhUSr
zkV>m#m1aB=r|TSV0YdRmqX2*+{yQj}>!a~E6j7y(`=BMx06@{Hb9uQ9T0q@XlC==R
z-8WET`^T;Mn@ly*tgsvC9d+ME%io3{imbPbe)~Ka{fB5Y8YtET<ZsRO9`cBFe?h(J
zX%Wm-9sSN}DYN$@!har&r8fO@GO=`&MH;1kMcz)a##d1)Pt=4gVL>To<Es@@_1Eox
zZp0eUl~t)%k3#i!P7Sd0amjd)L;!bP9~X;yfC69Mhvk2_il}j2cRG`+=UQAJJ}+7X
zNoqoxQebKLgC_qfRC5N_2Nd8LdwBmd(IDXK{t)odcdvEnZNs}>r%j=U)Uly452+b1
zDY=}aYKTU(&{IPh85%QIew0Ft`I%-GX0{RHi%WRMxUfc;XbAQ&J$bvf!69;u|E-?A
ze@VJr0>6dFWnC;ONEtd=&PsXt6EvHDa;JFK`TSQi^7{+D%0k?LnQ8_<+rAi%C&yX)
z?8n18&ODVNw?~S9WXYl<ibbz@fTbjJGu2BjEHT>2B3CO0h5BHF&>)}5?w47d!;>8I
zMf^=`Yu0#Nc$Uiev6m-^k&8RC-x%M!x-9cuk8wUK|FFoI-wH>w0<m*=n4{eAB|RX9
z{_oL5L~MW}=6Z&OE=NQFsHejX&~m?dk|rM1wwU9&f01<+WJ$S8W<T->;jpJlY;cBg
zRUX@PZ>$l<YjRsye$!XKhF*l{CuD39{S5sJw_vR;ElB%~;TbH+UpfuL!gqc(HG?{7
zp)(>Yol=+PE^H9XPUds|7Ix95O*vh(w&(k+{_zja=rYN!5>P3a%<G6=V@QclK;V+w
z@I1S8xqI}@6YJeP5C`|%T^JE=gmaaWe0N{&NA2BFyQ2##Q8omQL&7ee-@i|PNb_i_
zxA)+5cBuU53=kx+>QZD~<Q-dv^YiIoOCt~%Co|0R(Q`ibg?%I;GlkzwwPuF$RfMTM
zs3z+Q)Aou<V0p}I{HR3a!KY6zJ#m9S1_dPwIRai&+E=2bL|$NR>^@YCMq<70jD8&w
zyT#KX(|*&7iWQZW=|ih{=`#T!3Ba49d$vw4yTqn;I;Xz?>V*I|$<wTja}bp}J9zN#
zd2576yfN+U^+E$2s$ls_x(u2>J%tx4dCnb)yw0B7Pd3bqVog}xT*|O-^szyzf?{V!
z--#!eWzz62GB*dr8;?s6A_KfRFgjnS!aNb+%lU%3{yg!B^^_afjpk!lo_1hB!J<RA
zz~X*Xw|ydN<WSr+`GEHU0|T?Ruk7~jRT6Q*tHi~~?B=_7_kejt{>iJL;)Vbteg0%o
z3cOBFUT5xz25`0ap%ffaOrzDbGBy9RXof&BS6kF6J4?{wRTocaf>vKh#=zo_?m|lA
z%74_g@mfXrAB}$$62@ZD>)4Jqnvddl;70kV7%qsJg_h>kp;F8H{ruQy<Or3TknRC&
zeck9!yMt09)f#mF4N^g7>9CLQ$fCH=R<;qdEIikdeA%wz2OsG399~{s6$=P$|06)e
zLXXbhpb5x7$JhSNIr=}o1Zt3G>Kto5GiTy<9nHPQ?1n0)PjMYKIT}?-A6_u4S2r9m
z15d%H*G^@v<?5$T+U~?TN~LS4Vzb(tQ~9HJK(P&ou{aQ+V#lBy@Ut===(J|W*q4o7
z%}N{pnOkGaxq$5FV5{V5r*;8PB!USC+>w*27Ah+x<z#0!I74c;ScB9T368&^P@s}Z
zPVKbGr<XqnZc4%YTuy%c55+wIWgzFR-Jp7Wz(YZKIR?Ab`#MOAwONff)SANd*P||V
z#FbdIb9Z;Iz8D?F1(@xQ$`9G!Z8b4UOic8S=RgjR?=4z9xy|FM0(<~rS363<!RG0F
z$3Xh>T-?WuU`e4^15~+ytj}ef$nwicP;EMpA}9O1eZB0iS`-|)4(@;g73@y(LAkD<
zRyA^ed`}(5T0`bm={dPHhy*28sG_s~A!dxPf_g*z_k}N(7yNKsugf7&dI4%X$}Q(*
zoOyVFiW~K-e6!59uE+V{F9Npy{><w%lp@c32h!e<%^yWTl?>`}|6Nx0|G7B)-5{0F
zzgDIl^f!2pHkeomSagk^WVJ2}9MP0{VSKKoCI7~#n+Y5zDGGzvpfC#Ts+8UN>=<zV
z=y0)HRVvz0sM31&qON<im3cT@jq^=U%PQDJAf+0yc}+(j<VoF$kCDE1<;<hjB@V3`
z*xk}1jM^I1^WRO+M?ZV&?tl3*y;<mv)4q9!LdaKSRHgPOsJ2Mtu@?tPieQAM`sFb2
zAT4;TDY?!kNpUM3&PqHBy`DbZnkx0W)%ACYS$b|!&h#0PYy!V~53#Qt)tDBTVA9M!
zChf3;Yd0TvG1w#-o^iA4KC#%V5MmxZ&v%em?VBSr*+0$>3rVILD>j+%du7Pb%WEAn
zi-m)W>dJ4J%EsCE=)p1S`5IH(jbPgW>DA>4a|(V*;u8Gb{hr6pr&WUwp3rOpo5$6+
z6@K(9og_W5BnK2`*H+hz<Y}m{iszaCSL3)O+6)6#_3PwfpfvlK>@Z(Ff3SR=C$iuL
z2H5C9$+NXFTdN87s)0H<F0*01-cwd73gZV_`&4`18K{nHmhsTDG*b8Tx{=k7|IB4O
z(2bCikzI6`@y!x%raI869Gp6O*mjf$gvm1vztJj~kem&P+g^iPfIcRjmgg3Z7je<k
zAA%x#$>78s&DxqlGRC=o4JhOYA+6mR*xZ7f!wTJlod&|a3-VpR-Rodt2X(w#DPdL`
zUK?A>qNDr8upaZ#jB5F$DU^hOk5AAuFl>!GzuAjxrBq=9VaM~HYJy0%F1H4-&#7hA
zzLj|-pK`dux;F>ZzAM+<#cuhx%sN^A3k)#g&-ZkXsMfw`Uf5gp5>Sh&=mTh~$Gzrk
zZ%=)2Q7JWQAhgVOC0W?L!M<HSlC5O|>^~=<t8qrM$Xjo{$KG{3t?O;@y?m%05qbaq
z{j#&~rhD|XI{ei~bHU}yY&uDY>mKQWiaxcq@g(968Ilqb-J@=}O%nYByc>@F-`L#v
zMMl9i^JNaL|M=@ajqv3<dtV>Xe))eEhV9Z^KISS&D(|m+WYL}2J;^NTbfRxp(lNL2
z@G`c}T|{udubxe}Y|qvi{I{4{Xogj8ZVZ(M4mzr9{v(Y0A6fY&h0?ED`wK!ub_1kD
z9@F*-fl0{zaxQiUqMcr&_^)5T;t)|%pDe5bKF4Wn_o{{}_cRZ|@U2NQ-Fx<6UA@#T
z#aHX6KM5%Bj;+;O1>1>1tplq@O^G$ad<%KAHguN329*C2MCW}83Iv*6tiMN8?b-G1
ze~b`EMA{D6_XH-{4zDNqYVBV`YH99?FN4yq1UdG6{J%@h&Wrxeg(WJ@*3#<KixD7%
zK$_pNQADsesq#3PG?U_z`!&#q1+dMWwoPWD$~e<vsi*h-Z6cbVlW)CJK?U*>s3a@Y
zs-OQ;Nf!3>%-zl8RZZ5u@ismwDe2ktHb?Ctb=TVJ3D^gk?0x#l4m*Rw7t1*`a7Ms8
ze{4sBJ-t}qhhzU7&{v?f{pD~rhhQ4C>!65=G5<@x>i?}h{eOVSI1cMVZ(8KTSsZUd
zcZ7JEwL$bzN5)cX2jB6UFuXxFFj>;ebkS|_I|wQ^Y?0mMNJ}2{2NwD5pYS?Jsd=BD
zjDlL<!X%+A2fA$FtouZOC~D(7kCBsguo=d{!0?EMSFyyU@v{UBWP`mx$V&Vp*y<d`
zs%_8`>a|6|^w4hw_~T%sU~0pq%4ybnY!6Ty?FqI_q@|^01W{h2okD%QC4-jyXm=5O
zeFC6*_=7-6uyYW6W8$DI%o9}l^Dk2KIyn3o9tQ<Fec_X5LnFO4o+sle^M;2TA}oGt
zMMmi)21gl@68IFL<4Kq8pC5Fq28g&E>%2SHhFM;k<z`&o(v#^4`<?mqM!2vQ+)@EX
zbL;U>37kgZw~knIYAX2`wdRYv?MF5`#Ve|GE&cak1cbu3Z+kd^4Hd$!_Rd>E59v4t
zeo9*9kgL7Dtbh|N%jneWcgGRNE4jRF4hTwove(jCnj6(KU4fJ*CO%4ZCtC5g1T%eP
z(_k_Ey>s>6WK>`B&RwR{uJU!*xnLcrrB4(ni~P(2KEgp_aJqrbJZhOBY_?yWD$stu
zJ?)nbL)IH-p7Vp#EUyIhW=CL4a<ni|iB3RXD09X@2w8&Uy^mqGPPrm5F`%m1;ShX#
z<Tjr}WAD$&(Zbom%Pbs6iwg7W3-j|jHE9_jtqi^z!*%=%hd9vv#O_3w66at`{T%BO
zd?1AI)^@r4jNmEl+3`+zG`lWWmS%G=yj;{PS<KTNY;`{%c0X7<m^zULd){FVFM~iJ
z6jdPqX4Y*nE@&46R8=>u4btUinxf9GP0WLxcCn|~7*P2)khVA6IbzPc*aqy3G~?Xu
zbgDH|K3RI}RuwG`*zhjDoSNo^^%7tM*xN_gP}S7DIYST_62hK+HT2N5C+6ERuEc>z
z1^Cnu_+EwCK*2ir9Ex(ba)XARgz9L|X@EBD)@@3PBZ(tn@sA5eVE`Wz63%s+&UY~$
zNMGrg1U<V;BltNuxbtgkvtAQ~4&((t2=AU;*RiwPjlNvh1@)saRaWo`5P_#3t|5)h
z>2I5}+S)x{Z6&0kUo>RMU%uARIRbXZgZ&|(fLh39VXVc>{zEWTr33QIcZr!%J{CsC
zmw3fi7s;pcorQ*q`A~`%(0p61>%_GF{*n&s!@}MTb$xw(AYTSzMJ!%XNBB$5Z{PBH
zua@jT;^1K9U5IPC2R>*3w)~t~B^OrYYsN7+7reH(Bu6FCq~~A%Qf(Q9bL}-?U8XtI
z>`}0FPcF0JEO#P$9azu>i*r?WU<YOt+#WswGyMHaykO?${zc3_Zy;DEP!LZqn@CD!
zcQC$30ZO{@LtIk4$G`_N0)nZW!Ive#o_*l-5{9^_W$9tiJ$vWj>ie_+d=jX$@bGms
z(U3|1Y?Z5|gG0|n(MPZ+sxPKg!7I_8!gskx_XZi6=SE8`AB=r<SE6?R=3gLF3o3Sa
z*+48vUC9f4ucC90oUJ>N*>su6>Z#4FYzQggNrf}ei1q%<<@xznl`3%X=@ej12Kayr
z+6%T!oDtJFt{+l^PnaMkpV-W1RFiyH3aGfi3CUBae+Bm24en}!4J5s*k%nyv{$P^}
zK!REz*0cL7EHJrvdEo|Ij<k^<Dxa=?veM*642InyXtaL&^a@mz1AOorF{{aP>yi2)
zQ*!T}etvK5oM(;Tqdw~Akr<6QsI2$uW=C;2z!##^A{_(8UsKr9&G%@YoMa_|<gL#d
zh=|^qj%3!hTi(P9(scS5<ar|IGutI%vb@)s>+lLnar+zY)j+kxzlBx*C5rT~5!e6Y
bOBb}w9_0f@gzulAUh`5y?pcAj-uwRr+*T%z

literal 0
HcmV?d00001

diff --git a/qwen3_moe_mast_20steps_loss_curve.svg b/qwen3_moe_mast_20steps_loss_curve.svg
new file mode 100644
index 00000000..7fc6c0ca
--- /dev/null
+++ b/qwen3_moe_mast_20steps_loss_curve.svg
@@ -0,0 +1,68 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="920" height="560" viewBox="0 0 920 560">
+<rect width="100%" height="100%" fill="white"/>
+<style>text{font-family:Arial,Helvetica,sans-serif;fill:#1f2937}.tick{font-size:12px}.label{font-size:15px}.title{font-size:22px;font-weight:700}.grid{stroke:#d1d5db;stroke-width:1;stroke-dasharray:4 5}.axis{stroke:#111827;stroke-width:1.5}.line{fill:none;stroke:#1f77b4;stroke-width:3;stroke-linejoin:round;stroke-linecap:round}.point{fill:#1f77b4;stroke:white;stroke-width:2}</style>
+<text class="title" x="460.0" y="34" text-anchor="middle">Qwen3 MoE 30B-A3B MAST Training Loss</text>
+<line class="grid" x1="88" y1="482.00" x2="884" y2="482.00"/>
+<text class="tick" x="76" y="486.00" text-anchor="end">9.5</text>
+<line class="grid" x1="88" y1="412.00" x2="884" y2="412.00"/>
+<text class="tick" x="76" y="416.00" text-anchor="end">10.0</text>
+<line class="grid" x1="88" y1="342.00" x2="884" y2="342.00"/>
+<text class="tick" x="76" y="346.00" text-anchor="end">10.5</text>
+<line class="grid" x1="88" y1="272.00" x2="884" y2="272.00"/>
+<text class="tick" x="76" y="276.00" text-anchor="end">11.0</text>
+<line class="grid" x1="88" y1="202.00" x2="884" y2="202.00"/>
+<text class="tick" x="76" y="206.00" text-anchor="end">11.5</text>
+<line class="grid" x1="88" y1="132.00" x2="884" y2="132.00"/>
+<text class="tick" x="76" y="136.00" text-anchor="end">12.0</text>
+<line class="grid" x1="88" y1="62.00" x2="884" y2="62.00"/>
+<text class="tick" x="76" y="66.00" text-anchor="end">12.5</text>
+<line class="grid" x1="88.00" y1="62" x2="88.00" y2="482"/>
+<text class="tick" x="88.00" y="506" text-anchor="middle">1</text>
+<line class="grid" x1="129.89" y1="62" x2="129.89" y2="482"/>
+<text class="tick" x="129.89" y="506" text-anchor="middle">2</text>
+<line class="grid" x1="213.68" y1="62" x2="213.68" y2="482"/>
+<text class="tick" x="213.68" y="506" text-anchor="middle">4</text>
+<line class="grid" x1="297.47" y1="62" x2="297.47" y2="482"/>
+<text class="tick" x="297.47" y="506" text-anchor="middle">6</text>
+<line class="grid" x1="381.26" y1="62" x2="381.26" y2="482"/>
+<text class="tick" x="381.26" y="506" text-anchor="middle">8</text>
+<line class="grid" x1="465.05" y1="62" x2="465.05" y2="482"/>
+<text class="tick" x="465.05" y="506" text-anchor="middle">10</text>
+<line class="grid" x1="548.84" y1="62" x2="548.84" y2="482"/>
+<text class="tick" x="548.84" y="506" text-anchor="middle">12</text>
+<line class="grid" x1="632.63" y1="62" x2="632.63" y2="482"/>
+<text class="tick" x="632.63" y="506" text-anchor="middle">14</text>
+<line class="grid" x1="716.42" y1="62" x2="716.42" y2="482"/>
+<text class="tick" x="716.42" y="506" text-anchor="middle">16</text>
+<line class="grid" x1="800.21" y1="62" x2="800.21" y2="482"/>
+<text class="tick" x="800.21" y="506" text-anchor="middle">18</text>
+<line class="grid" x1="884.00" y1="62" x2="884.00" y2="482"/>
+<text class="tick" x="884.00" y="506" text-anchor="middle">20</text>
+<line class="axis" x1="88" y1="482" x2="884" y2="482"/>
+<line class="axis" x1="88" y1="62" x2="88" y2="482"/>
+<text class="label" x="460.0" y="536" text-anchor="middle">Training step</text>
+<text class="label" transform="translate(24 280.0) rotate(-90)" text-anchor="middle">Loss</text>
+<polyline class="line" points="88.00,79.02 129.89,81.14 171.79,85.61 213.68,92.24 255.58,101.13 297.47,112.38 339.37,125.14 381.26,140.13 423.16,157.33 465.05,176.44 506.95,197.38 548.84,219.78 590.74,244.26 632.63,268.22 674.53,297.78 716.42,325.93 758.32,358.37 800.21,389.95 842.11,422.79 884.00,459.42"/>
+<circle class="point" cx="88.00" cy="79.02" r="4.2"><title>step 1: 12.37845</title></circle>
+<circle class="point" cx="129.89" cy="81.14" r="4.2"><title>step 2: 12.36325</title></circle>
+<circle class="point" cx="171.79" cy="85.61" r="4.2"><title>step 3: 12.33137</title></circle>
+<circle class="point" cx="213.68" cy="92.24" r="4.2"><title>step 4: 12.28397</title></circle>
+<circle class="point" cx="255.58" cy="101.13" r="4.2"><title>step 5: 12.22048</title></circle>
+<circle class="point" cx="297.47" cy="112.38" r="4.2"><title>step 6: 12.14017</title></circle>
+<circle class="point" cx="339.37" cy="125.14" r="4.2"><title>step 7: 12.04897</title></circle>
+<circle class="point" cx="381.26" cy="140.13" r="4.2"><title>step 8: 11.94193</title></circle>
+<circle class="point" cx="423.16" cy="157.33" r="4.2"><title>step 9: 11.81908</title></circle>
+<circle class="point" cx="465.05" cy="176.44" r="4.2"><title>step 10: 11.68259</title></circle>
+<circle class="point" cx="506.95" cy="197.38" r="4.2"><title>step 11: 11.53297</title></circle>
+<circle class="point" cx="548.84" cy="219.78" r="4.2"><title>step 12: 11.37303</title></circle>
+<circle class="point" cx="590.74" cy="244.26" r="4.2"><title>step 13: 11.19815</title></circle>
+<circle class="point" cx="632.63" cy="268.22" r="4.2"><title>step 14: 11.02700</title></circle>
+<circle class="point" cx="674.53" cy="297.78" r="4.2"><title>step 15: 10.81583</title></circle>
+<circle class="point" cx="716.42" cy="325.93" r="4.2"><title>step 16: 10.61479</title></circle>
+<circle class="point" cx="758.32" cy="358.37" r="4.2"><title>step 17: 10.38304</title></circle>
+<circle class="point" cx="800.21" cy="389.95" r="4.2"><title>step 18: 10.15753</title></circle>
+<circle class="point" cx="842.11" cy="422.79" r="4.2"><title>step 19: 9.92291</title></circle>
+<circle class="point" cx="884.00" cy="459.42" r="4.2"><title>step 20: 9.66127</title></circle>
+<text class="tick" x="98.00" y="69.02" text-anchor="start">12.37845</text>
+<text class="tick" x="874.00" y="449.42" text-anchor="end">9.66127</text>
+</svg>
\ No newline at end of file
diff --git a/qwen3_moe_mast_20steps_losses.csv b/qwen3_moe_mast_20steps_losses.csv
new file mode 100644
index 00000000..cf58cdd8
--- /dev/null
+++ b/qwen3_moe_mast_20steps_losses.csv
@@ -0,0 +1,21 @@
+step,loss
+1,12.37845
+2,12.36325
+3,12.33137
+4,12.28397
+5,12.22048
+6,12.14017
+7,12.04897
+8,11.94193
+9,11.81908
+10,11.68259
+11,11.53297
+12,11.37303
+13,11.19815
+14,11.02700
+15,10.81583
+16,10.61479
+17,10.38304
+18,10.15753
+19,9.92291
+20,9.66127

From 2f4f102e60accd248dde4ca8c0a2557746ba7a0f Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Thu, 28 May 2026 12:44:47 -0700
Subject: [PATCH 03/27] Add LP relaxation support for sharding optimizer

Adds LP-relaxation lower-bound plumbing and initial DP topology construction coverage, while removing generated profile/log artifacts from tracking and ignoring future outputs.\n\nAuthored with Claude.
---
 .gitignore                                    |      2 +
 autoparallel/optimize_sharding.py             |    207 +-
 autoparallel/serialization.py                 |      1 +
 ...ama3_3b_ilp_node_indegree_distribution.svg |     51 -
 .../llama3_8b_4x4_strategy_full.json          | 287470 ---------------
 .../llama3_8b_4x4_strategy_summary.json       |   2054 -
 .../real_llama3_3b_dag_node_stats.csv         |   7200 -
 .../real_llama3_3b_dag_summary.json           |    883 -
 .../real_llama3_3b_merge_points.csv           |   1668 -
 profile_results/real_llama3_by_mesh_dim.svg   |    167 -
 profile_results/real_llama3_by_model_size.svg |    177 -
 profile_results/real_llama3_dag_analysis.py   |    255 -
 .../real_llama3_optimizer_presolve_3d4d.log   |      7 -
 .../real_llama3_optimizer_sweep.csv           |      9 -
 .../real_llama3_optimizer_sweep.jsonl         |      8 -
 .../real_llama3_optimizer_sweep.log           |     54 -
 .../real_llama3_optimizer_sweep.py            |    351 -
 .../real_llama3_partial_presolve.csv          |      3 -
 profile_results/real_llama3_timeouts.csv      |      3 -
 qwen3_8b_autoparallel_30steps.log             |      1 -
 tests/test_dp_solver.py                       |    158 +
 tests/test_lp_relaxation.py                   |    103 +
 22 files changed, 467 insertions(+), 300365 deletions(-)
 delete mode 100644 profile_results/llama3_3b_ilp_node_indegree_distribution.svg
 delete mode 100644 profile_results/llama3_8b_4x4_strategy_full.json
 delete mode 100644 profile_results/llama3_8b_4x4_strategy_summary.json
 delete mode 100644 profile_results/real_llama3_3b_dag_node_stats.csv
 delete mode 100644 profile_results/real_llama3_3b_dag_summary.json
 delete mode 100644 profile_results/real_llama3_3b_merge_points.csv
 delete mode 100644 profile_results/real_llama3_by_mesh_dim.svg
 delete mode 100644 profile_results/real_llama3_by_model_size.svg
 delete mode 100644 profile_results/real_llama3_dag_analysis.py
 delete mode 100644 profile_results/real_llama3_optimizer_presolve_3d4d.log
 delete mode 100644 profile_results/real_llama3_optimizer_sweep.csv
 delete mode 100644 profile_results/real_llama3_optimizer_sweep.jsonl
 delete mode 100644 profile_results/real_llama3_optimizer_sweep.log
 delete mode 100644 profile_results/real_llama3_optimizer_sweep.py
 delete mode 100644 profile_results/real_llama3_partial_presolve.csv
 delete mode 100644 profile_results/real_llama3_timeouts.csv
 delete mode 120000 qwen3_8b_autoparallel_30steps.log
 create mode 100644 tests/test_dp_solver.py
 create mode 100644 tests/test_lp_relaxation.py

diff --git a/.gitignore b/.gitignore
index bcaae24d..ff4f7532 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 *.pyc
 *.pyo
 *.so
+*.log
 
 .mypy_cache/
 *.egg-info/
@@ -12,5 +13,6 @@ build/
 dist/
 tmp/
 out/
+profile_results/
 
 .vscode/
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 2b1909ee..06f2a4e6 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -203,6 +203,62 @@ class DecisionVar:
     input_spec: Any  # DTensorSpec
 
 
+@dataclass
+class LPRelaxationResult:
+    objective: float
+    status: str
+    solve_s: float
+    total_s: float
+
+
+@dataclass
+class DPTopology:
+    nodes: list[torch.fx.Node]
+    predecessors: dict[torch.fx.Node, list[torch.fx.Node]]
+    node_to_index: dict[torch.fx.Node, int]
+
+
+class DPBasedShardingSolver:
+    def __init__(self, optimizer):
+        self.optimizer = optimizer
+        self.topology: Optional[DPTopology] = None
+
+    def build_topological_order(self):
+        nodes = [node for node in self.optimizer.nodes if node.op != "output"]
+        node_to_index = {node: i for i, node in enumerate(nodes)}
+        predecessors = {}
+
+        for node in nodes:
+            node_predecessors = self.optimizer._all_input_nodes(node)
+            predecessors[node] = node_predecessors
+            node_index = node_to_index[node]
+            for pred in node_predecessors:
+                pred_index = node_to_index.get(pred)
+                if pred_index is None:
+                    raise RuntimeError(
+                        f"Predecessor {pred} for node {node} is missing from "
+                        "the DP topology"
+                    )
+                if pred_index >= node_index:
+                    raise RuntimeError(
+                        f"Predecessor {pred} for node {node} does not appear "
+                        "before it in topological order"
+                    )
+
+        self.topology = DPTopology(
+            nodes=nodes,
+            predecessors=predecessors,
+            node_to_index=node_to_index,
+        )
+        return self.topology
+
+    def get_solution(self, verbose=False):
+        raise NotImplementedError(
+            "DP-based sharding solver only builds topological order today; "
+            "strategy selection is not implemented yet."
+        )
+
+
 def _assert_has_tensor_meta(spec_or_specs, node, label):
     """Assert that all DTensorSpecs in a spec (possibly a tuple) have tensor_meta."""
     if isinstance(spec_or_specs, (list, tuple)):
@@ -224,8 +280,15 @@ def __init__(
         mesh,
         force_grad_reduce_in_higher_precision=False,
         repeated_subgraphs=False,
+        solver_backend="ilp",
     ):
         self.orig_gm = gm
+        if solver_backend not in {"ilp", "dp"}:
+            raise ValueError(
+                f"Unsupported solver_backend={solver_backend!r}; "
+                "expected 'ilp' or 'dp'"
+            )
+        self.solver_backend = solver_backend
         # The optimizer works on a concretized copy of the graph where all
         # symbolic shapes are replaced with their concrete hint values. This
         # centralizes dynamic-shape handling: the optimization pipeline
@@ -276,6 +339,37 @@ def __init__(
         get_placement_options_timer().report()
 
         self.cluster_links: dict[tuple, tuple] = {}
+        if self.solver_backend == "dp":
+            t0 = time.perf_counter()
+            self.solver = DPBasedShardingSolver(self)
+            topology = self.solver.build_topological_order()
+            t1 = time.perf_counter()
+            self.profile["dp"] = {
+                "topology_nodes": len(topology.nodes),
+                "topology_edges": sum(
+                    len(preds) for preds in topology.predecessors.values()
+                ),
+            }
+            self.profile["timings"].update(
+                {
+                    "topology_construction_s": t1 - t0,
+                    "init_total_s": t1 - t_init_start,
+                }
+            )
+            logger.info(
+                "ShardingOptimizer phase profile: phase=dp_topology "
+                "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s "
+                "topology_nodes=%s topology_edges=%s elapsed=%.3fs",
+                self.profile["mesh"]["shape"],
+                self.profile["mesh"]["dim_names"],
+                self.profile["mesh"]["size"],
+                self._format_billions(self.profile["model"]["parameter_numel"]),
+                self.profile["dp"]["topology_nodes"],
+                self.profile["dp"]["topology_edges"],
+                t1 - t0,
+            )
+            return
+
         if repeated_subgraphs:
             t = time.time()
             clusters = get_identical_regions(self.gm.graph, self.strats)
@@ -641,14 +735,19 @@ def walk_over_options(self, node, constrain_arg=None):
                 for inp_idx in range(len(strategy.redistribute_cost[argi])):
                     yield argi, out_idx, inp_idx
 
-    def _create_pulp_variables(self):
-        """Create PuLP binary variables for all decision points, resolving
-        cluster links so that identical nodes share the same variable.
+    def _create_pulp_variables(self, variable_category=pulp.LpBinary):
+        """Create PuLP variables for all decision points, resolving cluster
+        links so that identical nodes share the same variable.
 
         Returns a dict mapping root (node_idx, argi, out_idx, inp_idx) keys
         to their PuLP variables. Linked keys are not stored; use
         _get_pulp_variable() to resolve them through cluster_links.
         """
+        if variable_category not in {pulp.LpBinary, pulp.LpContinuous}:
+            raise ValueError(
+                f"Unsupported variable_category={variable_category!r}; "
+                "expected pulp.LpBinary or pulp.LpContinuous"
+            )
         cluster_linked_node_idxs = {key[0] for key in self.cluster_links}
 
         pulp_variables = {}
@@ -661,10 +760,16 @@ def _create_pulp_variables(self):
             for argi, out_idx, inp_idx in self.walk_over_options(node):
                 key = (node_idx, argi, out_idx, inp_idx)
                 root_node = self.nodes[node_idx]
+                bounds = (
+                    {"lowBound": 0, "upBound": 1}
+                    if variable_category == pulp.LpContinuous
+                    else {}
+                )
                 pulp_variables[key] = pulp.LpVariable(
                     f"n={root_node},s={node_idx},arg={argi},"
                     f"output_p={out_idx},input_p={inp_idx}",
-                    cat=pulp.LpBinary,
+                    cat=variable_category,
+                    **bounds,
                 )
 
         return pulp_variables
@@ -1133,6 +1238,97 @@ def _set_objective(self):
             terms.append(dv.var * dv.cost * multiplier)
         self.prob += pulp.lpSum(terms)
 
+    def get_lower_bound(self, verbose=False):
+        """Solve the LP relaxation and return a lower bound on the ILP objective.
+
+        This relaxes the existing binary PuLP variables to continuous variables
+        in [0, 1], solves the current problem with all constraints already added,
+        then restores the optimizer state. The result is a certificate only:
+        fractional LP values are not valid sharding placements.
+        """
+        if self.solver_backend == "dp":
+            raise NotImplementedError(
+                "LP relaxation is only available for the PuLP-backed optimizer"
+            )
+
+        t0 = time.perf_counter()
+        old_objective = self.prob.objective
+        old_status = self.prob.status
+        old_sol_status = getattr(self.prob, "sol_status", None)
+        old_selected_keys_marker = object()
+        old_selected_keys = getattr(self, "selected_keys", old_selected_keys_marker)
+        var_states = {
+            var: (var.cat, var.lowBound, var.upBound, var.varValue)
+            for var in self.pulp_variables.values()
+        }
+
+        try:
+            if self.prob.objective is None:
+                self._set_objective()
+
+            for var in self.pulp_variables.values():
+                var.cat = pulp.LpContinuous
+                var.lowBound = 0
+                var.upBound = 1
+                var.varValue = None
+
+            solver = pulp.PULP_CBC_CMD(msg=verbose)
+            t_solve0 = time.perf_counter()
+            with tempfile.TemporaryDirectory() as tmpdir:
+                solver.tmpDir = tmpdir
+                self.prob.solve(solver)
+            solve_s = time.perf_counter() - t_solve0
+
+            status = pulp.LpStatus.get(self.prob.status, self.prob.status)
+            objective = self._safe_float(pulp.value(self.prob.objective))
+            result = LPRelaxationResult(
+                objective=objective,
+                status=status,
+                solve_s=solve_s,
+                total_s=time.perf_counter() - t0,
+            )
+            self.profile["last_lp_relaxation"] = {
+                "objective": result.objective,
+                "status": result.status,
+                "solve_s": result.solve_s,
+                "total_s": result.total_s,
+            }
+            logger.info(
+                "ShardingOptimizer LP relaxation profile: "
+                "mesh_shape=%s mesh_dim_names=%s mesh_size=%s model_params=%s "
+                "unique_ilp_vars=%s constraints=%s status=%s objective=%.4f "
+                "timings={solve=%.3fs,total=%.3fs}",
+                self.profile["mesh"]["shape"],
+                self.profile["mesh"]["dim_names"],
+                self.profile["mesh"]["size"],
+                self._format_billions(self.profile["model"]["parameter_numel"]),
+                len(self.pulp_variables),
+                len(self.prob.constraints),
+                result.status,
+                result.objective,
+                result.solve_s,
+                result.total_s,
+            )
+            return result
+        finally:
+            for var, (cat, low_bound, up_bound, value) in var_states.items():
+                var.cat = cat
+                var.lowBound = low_bound
+                var.upBound = up_bound
+                var.varValue = value
+            self.prob.objective = old_objective
+            self.prob.status = old_status
+            if old_sol_status is None:
+                if hasattr(self.prob, "sol_status"):
+                    delattr(self.prob, "sol_status")
+            else:
+                self.prob.sol_status = old_sol_status
+            if old_selected_keys is old_selected_keys_marker:
+                if hasattr(self, "selected_keys"):
+                    delattr(self, "selected_keys")
+            else:
+                self.selected_keys = old_selected_keys
+
     def _solve(self, verbose=False):
         solver = pulp.PULP_CBC_CMD(msg=verbose)
         # Use a dedicated temp directory for PuLP's intermediate files (.mps,
@@ -1257,6 +1453,9 @@ def _to_concrete_solution(self, solution):
         return {self._orig_to_concrete[node]: spec for node, spec in solution.items()}
 
     def get_solution(self, verbose=False):
+        if self.solver_backend == "dp":
+            return self.solver.get_solution(verbose=verbose)
+
         t0 = time.perf_counter()
         t_objective0 = time.perf_counter()
         self._set_objective()
diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py
index 1b31bab8..0dde5b69 100644
--- a/autoparallel/serialization.py
+++ b/autoparallel/serialization.py
@@ -257,6 +257,7 @@ def load_optimizer(cls, path):
     opt.strats = strats
     opt.nodes = list(strats.keys())
     opt.node_map = {node: i for i, node in enumerate(opt.nodes)}
+    opt.solver_backend = "ilp"
     opt.force_grad_reduce_in_higher_precision = save_dict[
         "force_grad_reduce_in_higher_precision"
     ]
diff --git a/profile_results/llama3_3b_ilp_node_indegree_distribution.svg b/profile_results/llama3_3b_ilp_node_indegree_distribution.svg
deleted file mode 100644
index d722fd85..00000000
--- a/profile_results/llama3_3b_ilp_node_indegree_distribution.svg
+++ /dev/null
@@ -1,51 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" width="980" height="620" viewBox="0 0 980 620">
-<rect width="100%" height="100%" fill="#ffffff"/>
-<style>
-text { font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; fill: #111827; }
-.title { font-size: 24px; font-weight: 700; }
-.subtitle { font-size: 13px; fill: #4b5563; }
-.axis { stroke: #111827; stroke-width: 1.2; }
-.grid { stroke: #e5e7eb; stroke-width: 1; }
-.tick { font-size: 12px; fill: #374151; }
-.label { font-size: 13px; font-weight: 600; }
-.note { font-size: 12px; fill: #6b7280; }
-</style>
-<text class="title" x="92" y="42">AutoParallel ILP Node In-Degree Distribution</text>
-<text class="subtitle" x="92" y="66">LLaMA3 3B, mesh=(64,), repeated_subgraphs=True; raw optimizer DAG, no manual cluster collapse</text>
-<text class="subtitle" x="92" y="84">Nodes excluding output: 7199; unique direct dependency edges: 8805</text>
-<line class="grid" x1="92" y1="524.0" x2="938" y2="524.0"/>
-<text class="tick" x="80" y="528.0" text-anchor="end">1</text>
-<line class="grid" x1="92" y1="419.0" x2="938" y2="419.0"/>
-<text class="tick" x="80" y="423.0" text-anchor="end">10</text>
-<line class="grid" x1="92" y1="314.0" x2="938" y2="314.0"/>
-<text class="tick" x="80" y="318.0" text-anchor="end">100</text>
-<line class="grid" x1="92" y1="209.0" x2="938" y2="209.0"/>
-<text class="tick" x="80" y="213.0" text-anchor="end">1000</text>
-<line class="grid" x1="92" y1="104.0" x2="938" y2="104.0"/>
-<text class="tick" x="80" y="108.0" text-anchor="end">10000</text>
-<line class="axis" x1="92" y1="524" x2="938" y2="524"/>
-<line class="axis" x1="92" y1="104" x2="92" y2="524"/>
-<text class="tick" x="515.0" y="586" text-anchor="middle">direct dependency nodes / in-degree</text>
-<text class="tick" transform="translate(26 314.0) rotate(-90)" text-anchor="middle">node count, log scale</text>
-<rect x="140.2" y="271.0" width="92.0" height="253.0" rx="4" fill="#64748b"/>
-<text class="label" x="186.2" y="247.0" text-anchor="middle">257</text>
-<text class="note" x="186.2" y="263.0" text-anchor="middle">3.57%</text>
-<text class="tick" x="186.2" y="550.0" text-anchor="middle">0</text>
-<rect x="328.8" y="133.2" width="92.0" height="390.8" rx="4" fill="#2563eb"/>
-<text class="label" x="374.8" y="109.2" text-anchor="middle">5275</text>
-<text class="note" x="374.8" y="125.2" text-anchor="middle">73.27%</text>
-<text class="tick" x="374.8" y="550.0" text-anchor="middle">1</text>
-<rect x="517.2" y="187.3" width="92.0" height="336.7" rx="4" fill="#0f766e"/>
-<text class="label" x="563.2" y="163.3" text-anchor="middle">1611</text>
-<text class="note" x="563.2" y="179.3" text-anchor="middle">22.38%</text>
-<text class="tick" x="563.2" y="550.0" text-anchor="middle">2</text>
-<rect x="705.8" y="372.0" width="92.0" height="152.0" rx="4" fill="#d97706"/>
-<text class="label" x="751.8" y="348.0" text-anchor="middle">28</text>
-<text class="note" x="751.8" y="364.0" text-anchor="middle">0.39%</text>
-<text class="tick" x="751.8" y="550.0" text-anchor="middle">3</text>
-<rect x="894.2" y="372.0" width="92.0" height="152.0" rx="4" fill="#dc2626"/>
-<text class="label" x="940.2" y="348.0" text-anchor="middle">28</text>
-<text class="note" x="940.2" y="364.0" text-anchor="middle">0.39%</text>
-<text class="tick" x="940.2" y="550.0" text-anchor="middle">8</text>
-<text class="note" x="92" y="606">Histogram: 0-&gt;257, 1-&gt;5275, 2-&gt;1611, 3-&gt;28, 8-&gt;28</text>
-</svg>
\ No newline at end of file
diff --git a/profile_results/llama3_8b_4x4_strategy_full.json b/profile_results/llama3_8b_4x4_strategy_full.json
deleted file mode 100644
index 88f58ae3..00000000
--- a/profile_results/llama3_8b_4x4_strategy_full.json
+++ /dev/null
@@ -1,287470 +0,0 @@
-{
-  "mesh": {
-    "dim_names": [
-      "dp",
-      "tp"
-    ],
-    "shape": [
-      4,
-      4
-    ]
-  },
-  "nodes": [
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "tok_embeddings.weight",
-      "name": "primals_1",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(1)S(1)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.attention.wq.weight",
-      "name": "primals_2",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.attention.wk.weight",
-      "name": "primals_3",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.attention.wv.weight",
-      "name": "primals_4",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.attention.wo.weight",
-      "name": "primals_5",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.feed_forward.w1.weight",
-      "name": "primals_6",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.feed_forward.w2.weight",
-      "name": "primals_7",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.feed_forward.w3.weight",
-      "name": "primals_8",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.attention_norm.weight",
-      "name": "primals_9",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.0.ffn_norm.weight",
-      "name": "primals_10",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.attention.wq.weight",
-      "name": "primals_11",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.attention.wk.weight",
-      "name": "primals_12",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.attention.wv.weight",
-      "name": "primals_13",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.attention.wo.weight",
-      "name": "primals_14",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.feed_forward.w1.weight",
-      "name": "primals_15",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.feed_forward.w2.weight",
-      "name": "primals_16",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.feed_forward.w3.weight",
-      "name": "primals_17",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.attention_norm.weight",
-      "name": "primals_18",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.1.ffn_norm.weight",
-      "name": "primals_19",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.attention.wq.weight",
-      "name": "primals_20",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.attention.wk.weight",
-      "name": "primals_21",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.attention.wv.weight",
-      "name": "primals_22",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.attention.wo.weight",
-      "name": "primals_23",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.feed_forward.w1.weight",
-      "name": "primals_24",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.feed_forward.w2.weight",
-      "name": "primals_25",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.feed_forward.w3.weight",
-      "name": "primals_26",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.attention_norm.weight",
-      "name": "primals_27",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.2.ffn_norm.weight",
-      "name": "primals_28",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.attention.wq.weight",
-      "name": "primals_29",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.attention.wk.weight",
-      "name": "primals_30",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.attention.wv.weight",
-      "name": "primals_31",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.attention.wo.weight",
-      "name": "primals_32",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.feed_forward.w1.weight",
-      "name": "primals_33",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.feed_forward.w2.weight",
-      "name": "primals_34",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.feed_forward.w3.weight",
-      "name": "primals_35",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.attention_norm.weight",
-      "name": "primals_36",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.3.ffn_norm.weight",
-      "name": "primals_37",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.attention.wq.weight",
-      "name": "primals_38",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.attention.wk.weight",
-      "name": "primals_39",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.attention.wv.weight",
-      "name": "primals_40",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.attention.wo.weight",
-      "name": "primals_41",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.feed_forward.w1.weight",
-      "name": "primals_42",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.feed_forward.w2.weight",
-      "name": "primals_43",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.feed_forward.w3.weight",
-      "name": "primals_44",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.attention_norm.weight",
-      "name": "primals_45",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.4.ffn_norm.weight",
-      "name": "primals_46",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.attention.wq.weight",
-      "name": "primals_47",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.attention.wk.weight",
-      "name": "primals_48",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.attention.wv.weight",
-      "name": "primals_49",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.attention.wo.weight",
-      "name": "primals_50",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.feed_forward.w1.weight",
-      "name": "primals_51",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.feed_forward.w2.weight",
-      "name": "primals_52",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.feed_forward.w3.weight",
-      "name": "primals_53",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.attention_norm.weight",
-      "name": "primals_54",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.5.ffn_norm.weight",
-      "name": "primals_55",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.attention.wq.weight",
-      "name": "primals_56",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.attention.wk.weight",
-      "name": "primals_57",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.attention.wv.weight",
-      "name": "primals_58",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.attention.wo.weight",
-      "name": "primals_59",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.feed_forward.w1.weight",
-      "name": "primals_60",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.feed_forward.w2.weight",
-      "name": "primals_61",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.feed_forward.w3.weight",
-      "name": "primals_62",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.attention_norm.weight",
-      "name": "primals_63",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.6.ffn_norm.weight",
-      "name": "primals_64",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.attention.wq.weight",
-      "name": "primals_65",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.attention.wk.weight",
-      "name": "primals_66",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.attention.wv.weight",
-      "name": "primals_67",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.attention.wo.weight",
-      "name": "primals_68",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.feed_forward.w1.weight",
-      "name": "primals_69",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.feed_forward.w2.weight",
-      "name": "primals_70",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.feed_forward.w3.weight",
-      "name": "primals_71",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.attention_norm.weight",
-      "name": "primals_72",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.7.ffn_norm.weight",
-      "name": "primals_73",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.attention.wq.weight",
-      "name": "primals_74",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.attention.wk.weight",
-      "name": "primals_75",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.attention.wv.weight",
-      "name": "primals_76",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.attention.wo.weight",
-      "name": "primals_77",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.feed_forward.w1.weight",
-      "name": "primals_78",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.feed_forward.w2.weight",
-      "name": "primals_79",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.feed_forward.w3.weight",
-      "name": "primals_80",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.attention_norm.weight",
-      "name": "primals_81",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.8.ffn_norm.weight",
-      "name": "primals_82",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.attention.wq.weight",
-      "name": "primals_83",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.attention.wk.weight",
-      "name": "primals_84",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.attention.wv.weight",
-      "name": "primals_85",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.attention.wo.weight",
-      "name": "primals_86",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.feed_forward.w1.weight",
-      "name": "primals_87",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.feed_forward.w2.weight",
-      "name": "primals_88",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.feed_forward.w3.weight",
-      "name": "primals_89",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.attention_norm.weight",
-      "name": "primals_90",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.9.ffn_norm.weight",
-      "name": "primals_91",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.attention.wq.weight",
-      "name": "primals_92",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.attention.wk.weight",
-      "name": "primals_93",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.attention.wv.weight",
-      "name": "primals_94",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.attention.wo.weight",
-      "name": "primals_95",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.feed_forward.w1.weight",
-      "name": "primals_96",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.feed_forward.w2.weight",
-      "name": "primals_97",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.feed_forward.w3.weight",
-      "name": "primals_98",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.attention_norm.weight",
-      "name": "primals_99",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.10.ffn_norm.weight",
-      "name": "primals_100",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.attention.wq.weight",
-      "name": "primals_101",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.attention.wk.weight",
-      "name": "primals_102",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.attention.wv.weight",
-      "name": "primals_103",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.attention.wo.weight",
-      "name": "primals_104",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.feed_forward.w1.weight",
-      "name": "primals_105",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.feed_forward.w2.weight",
-      "name": "primals_106",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.feed_forward.w3.weight",
-      "name": "primals_107",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.attention_norm.weight",
-      "name": "primals_108",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.11.ffn_norm.weight",
-      "name": "primals_109",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.attention.wq.weight",
-      "name": "primals_110",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.attention.wk.weight",
-      "name": "primals_111",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.attention.wv.weight",
-      "name": "primals_112",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.attention.wo.weight",
-      "name": "primals_113",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.feed_forward.w1.weight",
-      "name": "primals_114",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.feed_forward.w2.weight",
-      "name": "primals_115",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.feed_forward.w3.weight",
-      "name": "primals_116",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.attention_norm.weight",
-      "name": "primals_117",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.12.ffn_norm.weight",
-      "name": "primals_118",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.attention.wq.weight",
-      "name": "primals_119",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.attention.wk.weight",
-      "name": "primals_120",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.attention.wv.weight",
-      "name": "primals_121",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.attention.wo.weight",
-      "name": "primals_122",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.feed_forward.w1.weight",
-      "name": "primals_123",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.feed_forward.w2.weight",
-      "name": "primals_124",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.feed_forward.w3.weight",
-      "name": "primals_125",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.attention_norm.weight",
-      "name": "primals_126",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.13.ffn_norm.weight",
-      "name": "primals_127",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.attention.wq.weight",
-      "name": "primals_128",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.attention.wk.weight",
-      "name": "primals_129",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.attention.wv.weight",
-      "name": "primals_130",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.attention.wo.weight",
-      "name": "primals_131",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.feed_forward.w1.weight",
-      "name": "primals_132",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.feed_forward.w2.weight",
-      "name": "primals_133",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.feed_forward.w3.weight",
-      "name": "primals_134",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.attention_norm.weight",
-      "name": "primals_135",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.14.ffn_norm.weight",
-      "name": "primals_136",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.attention.wq.weight",
-      "name": "primals_137",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.attention.wk.weight",
-      "name": "primals_138",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.attention.wv.weight",
-      "name": "primals_139",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.attention.wo.weight",
-      "name": "primals_140",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.feed_forward.w1.weight",
-      "name": "primals_141",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.feed_forward.w2.weight",
-      "name": "primals_142",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.feed_forward.w3.weight",
-      "name": "primals_143",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.attention_norm.weight",
-      "name": "primals_144",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.15.ffn_norm.weight",
-      "name": "primals_145",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.attention.wq.weight",
-      "name": "primals_146",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.attention.wk.weight",
-      "name": "primals_147",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.attention.wv.weight",
-      "name": "primals_148",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.attention.wo.weight",
-      "name": "primals_149",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.feed_forward.w1.weight",
-      "name": "primals_150",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.feed_forward.w2.weight",
-      "name": "primals_151",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.feed_forward.w3.weight",
-      "name": "primals_152",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.attention_norm.weight",
-      "name": "primals_153",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.16.ffn_norm.weight",
-      "name": "primals_154",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.attention.wq.weight",
-      "name": "primals_155",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.attention.wk.weight",
-      "name": "primals_156",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.attention.wv.weight",
-      "name": "primals_157",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.attention.wo.weight",
-      "name": "primals_158",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.feed_forward.w1.weight",
-      "name": "primals_159",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.feed_forward.w2.weight",
-      "name": "primals_160",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.feed_forward.w3.weight",
-      "name": "primals_161",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.attention_norm.weight",
-      "name": "primals_162",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.17.ffn_norm.weight",
-      "name": "primals_163",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.attention.wq.weight",
-      "name": "primals_164",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.attention.wk.weight",
-      "name": "primals_165",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.attention.wv.weight",
-      "name": "primals_166",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.attention.wo.weight",
-      "name": "primals_167",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.feed_forward.w1.weight",
-      "name": "primals_168",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.feed_forward.w2.weight",
-      "name": "primals_169",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.feed_forward.w3.weight",
-      "name": "primals_170",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.attention_norm.weight",
-      "name": "primals_171",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.18.ffn_norm.weight",
-      "name": "primals_172",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.attention.wq.weight",
-      "name": "primals_173",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.attention.wk.weight",
-      "name": "primals_174",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.attention.wv.weight",
-      "name": "primals_175",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.attention.wo.weight",
-      "name": "primals_176",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.feed_forward.w1.weight",
-      "name": "primals_177",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.feed_forward.w2.weight",
-      "name": "primals_178",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.feed_forward.w3.weight",
-      "name": "primals_179",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.attention_norm.weight",
-      "name": "primals_180",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.19.ffn_norm.weight",
-      "name": "primals_181",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.attention.wq.weight",
-      "name": "primals_182",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.attention.wk.weight",
-      "name": "primals_183",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.attention.wv.weight",
-      "name": "primals_184",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.attention.wo.weight",
-      "name": "primals_185",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.feed_forward.w1.weight",
-      "name": "primals_186",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.feed_forward.w2.weight",
-      "name": "primals_187",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.feed_forward.w3.weight",
-      "name": "primals_188",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.attention_norm.weight",
-      "name": "primals_189",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.20.ffn_norm.weight",
-      "name": "primals_190",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.attention.wq.weight",
-      "name": "primals_191",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.attention.wk.weight",
-      "name": "primals_192",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.attention.wv.weight",
-      "name": "primals_193",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.attention.wo.weight",
-      "name": "primals_194",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.feed_forward.w1.weight",
-      "name": "primals_195",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.feed_forward.w2.weight",
-      "name": "primals_196",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.feed_forward.w3.weight",
-      "name": "primals_197",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.attention_norm.weight",
-      "name": "primals_198",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.21.ffn_norm.weight",
-      "name": "primals_199",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.attention.wq.weight",
-      "name": "primals_200",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.attention.wk.weight",
-      "name": "primals_201",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.attention.wv.weight",
-      "name": "primals_202",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.attention.wo.weight",
-      "name": "primals_203",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.feed_forward.w1.weight",
-      "name": "primals_204",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.feed_forward.w2.weight",
-      "name": "primals_205",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.feed_forward.w3.weight",
-      "name": "primals_206",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.attention_norm.weight",
-      "name": "primals_207",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.22.ffn_norm.weight",
-      "name": "primals_208",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.attention.wq.weight",
-      "name": "primals_209",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.attention.wk.weight",
-      "name": "primals_210",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.attention.wv.weight",
-      "name": "primals_211",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.attention.wo.weight",
-      "name": "primals_212",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.feed_forward.w1.weight",
-      "name": "primals_213",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.feed_forward.w2.weight",
-      "name": "primals_214",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.feed_forward.w3.weight",
-      "name": "primals_215",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.attention_norm.weight",
-      "name": "primals_216",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.23.ffn_norm.weight",
-      "name": "primals_217",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.attention.wq.weight",
-      "name": "primals_218",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.attention.wk.weight",
-      "name": "primals_219",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.attention.wv.weight",
-      "name": "primals_220",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.attention.wo.weight",
-      "name": "primals_221",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.feed_forward.w1.weight",
-      "name": "primals_222",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.feed_forward.w2.weight",
-      "name": "primals_223",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.feed_forward.w3.weight",
-      "name": "primals_224",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.attention_norm.weight",
-      "name": "primals_225",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.24.ffn_norm.weight",
-      "name": "primals_226",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.attention.wq.weight",
-      "name": "primals_227",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.attention.wk.weight",
-      "name": "primals_228",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.attention.wv.weight",
-      "name": "primals_229",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.attention.wo.weight",
-      "name": "primals_230",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.feed_forward.w1.weight",
-      "name": "primals_231",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.feed_forward.w2.weight",
-      "name": "primals_232",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.feed_forward.w3.weight",
-      "name": "primals_233",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.attention_norm.weight",
-      "name": "primals_234",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.25.ffn_norm.weight",
-      "name": "primals_235",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.attention.wq.weight",
-      "name": "primals_236",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.attention.wk.weight",
-      "name": "primals_237",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.attention.wv.weight",
-      "name": "primals_238",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.attention.wo.weight",
-      "name": "primals_239",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.feed_forward.w1.weight",
-      "name": "primals_240",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.feed_forward.w2.weight",
-      "name": "primals_241",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.feed_forward.w3.weight",
-      "name": "primals_242",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.attention_norm.weight",
-      "name": "primals_243",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.26.ffn_norm.weight",
-      "name": "primals_244",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.attention.wq.weight",
-      "name": "primals_245",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.attention.wk.weight",
-      "name": "primals_246",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.attention.wv.weight",
-      "name": "primals_247",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.attention.wo.weight",
-      "name": "primals_248",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.feed_forward.w1.weight",
-      "name": "primals_249",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.feed_forward.w2.weight",
-      "name": "primals_250",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.feed_forward.w3.weight",
-      "name": "primals_251",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.attention_norm.weight",
-      "name": "primals_252",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.27.ffn_norm.weight",
-      "name": "primals_253",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.attention.wq.weight",
-      "name": "primals_254",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.attention.wk.weight",
-      "name": "primals_255",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.attention.wv.weight",
-      "name": "primals_256",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.attention.wo.weight",
-      "name": "primals_257",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.feed_forward.w1.weight",
-      "name": "primals_258",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.feed_forward.w2.weight",
-      "name": "primals_259",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.feed_forward.w3.weight",
-      "name": "primals_260",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.attention_norm.weight",
-      "name": "primals_261",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.28.ffn_norm.weight",
-      "name": "primals_262",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.attention.wq.weight",
-      "name": "primals_263",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.attention.wk.weight",
-      "name": "primals_264",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.attention.wv.weight",
-      "name": "primals_265",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.attention.wo.weight",
-      "name": "primals_266",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.feed_forward.w1.weight",
-      "name": "primals_267",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.feed_forward.w2.weight",
-      "name": "primals_268",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.feed_forward.w3.weight",
-      "name": "primals_269",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.attention_norm.weight",
-      "name": "primals_270",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.29.ffn_norm.weight",
-      "name": "primals_271",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.attention.wq.weight",
-      "name": "primals_272",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.attention.wk.weight",
-      "name": "primals_273",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.attention.wv.weight",
-      "name": "primals_274",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.attention.wo.weight",
-      "name": "primals_275",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.feed_forward.w1.weight",
-      "name": "primals_276",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.feed_forward.w2.weight",
-      "name": "primals_277",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.feed_forward.w3.weight",
-      "name": "primals_278",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.attention_norm.weight",
-      "name": "primals_279",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.30.ffn_norm.weight",
-      "name": "primals_280",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.attention.wq.weight",
-      "name": "primals_281",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.attention.wk.weight",
-      "name": "primals_282",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.attention.wv.weight",
-      "name": "primals_283",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.attention.wo.weight",
-      "name": "primals_284",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.feed_forward.w1.weight",
-      "name": "primals_285",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.feed_forward.w2.weight",
-      "name": "primals_286",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.feed_forward.w3.weight",
-      "name": "primals_287",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.attention_norm.weight",
-      "name": "primals_288",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "layers.31.ffn_norm.weight",
-      "name": "primals_289",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "norm.weight",
-      "name": "primals_290",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [],
-      "module_path": "output.weight",
-      "name": "primals_291",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "param",
-      "placement": "S(0)S(0)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [],
-      "module_path": "freqs_cis",
-      "name": "primals_292",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "buffer",
-      "placement": "RR",
-      "shape": [
-        8192,
-        64
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "int64",
-      "inputs": [],
-      "name": "primals_293",
-      "op": "placeholder",
-      "phase": "forward",
-      "placeholder_kind": "input",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [],
-      "name": "tangents_1",
-      "op": "placeholder",
-      "phase": "backward",
-      "placeholder_kind": "tangent",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        128256
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 76.40578345195063,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(1)S(1)",
-          "name": "primals_1",
-          "src_placement": "S(1)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "dtype_cast",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(1)S(1)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "int64",
-      "inputs": [
-        {
-          "comm_cost": 21.38246153846154,
-          "dst_placement": "RR",
-          "name": "primals_293",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "alias_default_1",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        8,
-        8192
-      ],
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 38.685829146330285,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(1)S(1)",
-          "name": "dtype_cast",
-          "src_placement": "S(1)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_1",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "embedding",
-      "op": "aten.embedding.default",
-      "phase": "forward",
-      "placement": "S(2)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 539
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 0,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_9",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "dtype_cast_1",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 706.2108351658422,
-          "dst_placement": "S(0)S(1)",
-          "name": "embedding",
-          "src_placement": "S(2)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "alias_default_3",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 539
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 1,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_5",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "pow_1",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mean",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "add",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "rsqrt",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_6",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_1",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_4",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_4",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_1",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type_1",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_2",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "dtype_cast_2",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_2",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "permute",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_7",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "alias_default_8",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_7",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_8",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "einsum_default",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_3",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "dtype_cast_3",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_3",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "permute_1",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "alias_default_9",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_7",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_9",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "einsum_default_1",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_4",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "dtype_cast_4",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_4",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "permute_2",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_2",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "alias_default_10",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_7",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_10",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "einsum_default_2",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_6",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_7",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_8",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_8",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_9",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_complex",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_9",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_10",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_complex_1",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "primals_292",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "name": "alias_default",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        8192,
-        64
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_11",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_11",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "mul_2",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_real",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_12",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "mul_3",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_real_1",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_13",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_10",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_11",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "unsqueeze",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "expand",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "clone",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_14",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "unsqueeze_1",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "expand_1",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "clone_1",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_15",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_3",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_4",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_5",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_12",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_13",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_14",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem_1",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem_6",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem_7",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "alias_default_15",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_6",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_16",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_5",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "dtype_cast_5",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_5",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "permute_7",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_16",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_7",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "alias_default_17",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_17",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "einsum_default_3",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0",
-      "name": "add_1",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_10",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "dtype_cast_6",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0",
-      "name": "alias_default_18",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_14",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_20",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "pow_2",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mean_1",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "add_2",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "rsqrt_1",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_21",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_4",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_6",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_19",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_19",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_5",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_15",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_6",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "dtype_cast_7",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_7",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "permute_8",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_22",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_8",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "alias_default_23",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_22",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_23",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "einsum_default_4",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "alias_default_24",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "convert_element_type_18",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_25",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "neg",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "exp",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "add_3",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "div",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "convert_element_type_19",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_8",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "dtype_cast_8",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_8",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "permute_9",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_9",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "alias_default_27",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_22",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_27",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "einsum_default_5",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_26",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "alias_default_28",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_6",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "dtype_cast_9",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "permute_10",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_29",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_10",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "alias_default_30",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_30",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "einsum_default_6",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_6",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0",
-      "name": "add_4",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_18",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "dtype_cast_10",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0",
-      "name": "alias_default_31",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_24",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_33",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "pow_3",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mean_2",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "add_5",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "rsqrt_2",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_34",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_7",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_10",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_32",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_32",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_8",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_25",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_11",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "dtype_cast_11",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_11",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "permute_11",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_35",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_11",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "alias_default_36",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_35",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_36",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "einsum_default_7",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_12",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "dtype_cast_12",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_12",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "permute_12",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_12",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "alias_default_37",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_35",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_37",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "einsum_default_8",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_13",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "dtype_cast_13",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_13",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "permute_13",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_13",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "alias_default_38",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_35",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_38",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "einsum_default_9",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_31",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_32",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_33",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_32",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_34",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_complex_2",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_33",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_35",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_complex_3",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_36",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_36",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_39",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_39",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "mul_9",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_real_2",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_37",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_39",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "mul_10",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_real_3",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_38",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_34",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_35",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "unsqueeze_2",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "expand_2",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "clone_2",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_39",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "unsqueeze_3",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "expand_3",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "clone_3",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_40",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_14",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_15",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_16",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_40",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_41",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_42",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_1",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_9",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_10",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_1",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_15",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_1",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_16",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "alias_default_43",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_17",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_41",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_14",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "dtype_cast_14",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_14",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "permute_18",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_44",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_18",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "alias_default_45",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_45",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "einsum_default_10",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1",
-      "name": "add_6",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_19",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "dtype_cast_15",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1",
-      "name": "alias_default_46",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_38",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_48",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "pow_4",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mean_3",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "add_7",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "rsqrt_3",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_49",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_11",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_15",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_47",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_47",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_12",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_39",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_15",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "dtype_cast_16",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_16",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "permute_19",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_50",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_19",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "alias_default_51",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_50",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_51",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "einsum_default_11",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "alias_default_52",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "convert_element_type_42",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_53",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "neg_1",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "exp_1",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "add_8",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "div_1",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "convert_element_type_43",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_17",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "dtype_cast_17",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_17",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "permute_20",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_20",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "alias_default_55",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_50",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_55",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "einsum_default_12",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_54",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "alias_default_56",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_13",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "dtype_cast_18",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "permute_21",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_57",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_21",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "alias_default_58",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_58",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "einsum_default_13",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_13",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1",
-      "name": "add_9",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_27",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "dtype_cast_19",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1",
-      "name": "alias_default_59",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_48",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_61",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "pow_5",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mean_4",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "add_10",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "rsqrt_4",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_62",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_14",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_19",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_60",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_60",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_15",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_49",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_20",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "dtype_cast_20",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_20",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "permute_22",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_63",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_22",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "alias_default_64",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_63",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_64",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "einsum_default_14",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_21",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "dtype_cast_21",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_21",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "permute_23",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_23",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "alias_default_65",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_63",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_65",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "einsum_default_15",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_22",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "dtype_cast_22",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_22",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "permute_24",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_24",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "alias_default_66",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_63",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_66",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "einsum_default_16",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_56",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_57",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_58",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_56",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_59",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_complex_4",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_57",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_60",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_complex_5",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_61",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_61",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_67",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_67",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "mul_16",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_real_4",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_62",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_67",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "mul_17",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_real_5",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_63",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_58",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_59",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "unsqueeze_4",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "expand_4",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "clone_4",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_64",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "unsqueeze_5",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "expand_5",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "clone_5",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_65",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_25",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_64",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_26",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_65",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_27",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_68",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_69",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_70",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_68",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_2",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_18",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_19",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_2",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_24",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_2",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_25",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "alias_default_71",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_28",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_66",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_23",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "dtype_cast_23",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_23",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "permute_29",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_66",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_72",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_29",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "alias_default_73",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_73",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "einsum_default_17",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2",
-      "name": "add_11",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_28",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "dtype_cast_24",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2",
-      "name": "alias_default_74",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_62",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_76",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_76",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "pow_6",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mean_5",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "add_12",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "rsqrt_5",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_77",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_76",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_18",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_24",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_75",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_75",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_19",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_63",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_24",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "dtype_cast_25",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_25",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "permute_30",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_78",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_30",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "alias_default_79",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_78",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_79",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "einsum_default_18",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "alias_default_80",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_80",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "convert_element_type_66",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_66",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_81",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "neg_2",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "exp_2",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "add_13",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "div_2",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "convert_element_type_67",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_26",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "dtype_cast_26",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_26",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "permute_31",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_31",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "alias_default_83",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_78",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_83",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "einsum_default_19",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_67",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_82",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "alias_default_84",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_20",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "dtype_cast_27",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "permute_32",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_85",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_32",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "alias_default_86",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_86",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "einsum_default_20",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_20",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2",
-      "name": "add_14",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_36",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "dtype_cast_28",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2",
-      "name": "alias_default_87",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_72",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_89",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_89",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "pow_7",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mean_6",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "add_15",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "rsqrt_6",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_90",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_89",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_21",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_28",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_88",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_88",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_22",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_73",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_29",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "dtype_cast_29",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_29",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "permute_33",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_73",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_91",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_33",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "alias_default_92",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_91",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_92",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "einsum_default_21",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_30",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "dtype_cast_30",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_30",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "permute_34",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_34",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "alias_default_93",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_91",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_93",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "einsum_default_22",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_31",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "dtype_cast_31",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_31",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "permute_35",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_35",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "alias_default_94",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_91",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_94",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "einsum_default_23",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_81",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_82",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_83",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_80",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_80",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_84",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_complex_6",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_81",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_85",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_complex_7",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_86",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_86",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_95",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_95",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "mul_23",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_real_6",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_87",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_95",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "mul_24",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_real_7",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_88",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_87",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_82",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_88",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_83",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "unsqueeze_6",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "expand_6",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "clone_6",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_89",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "unsqueeze_7",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "expand_7",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "clone_7",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_90",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_36",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_89",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_37",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_90",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_38",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_96",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_97",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_98",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_97",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_98",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_3",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_27",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_28",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_3",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_33",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_3",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_34",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "alias_default_99",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_39",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_91",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_32",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "dtype_cast_32",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_32",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "permute_40",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_100",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_40",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "alias_default_101",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_100",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_101",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "einsum_default_24",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3",
-      "name": "add_16",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_37",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "dtype_cast_33",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3",
-      "name": "alias_default_102",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_86",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_86",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_104",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_104",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "pow_8",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mean_7",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "add_17",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "rsqrt_7",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_105",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_104",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_105",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_25",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_33",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_103",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_103",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_26",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_87",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_33",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "dtype_cast_34",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_34",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "permute_41",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_106",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_41",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "alias_default_107",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_106",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_107",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "einsum_default_25",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "alias_default_108",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "convert_element_type_90",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_90",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_109",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "neg_3",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "exp_3",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "add_18",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "div_3",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "convert_element_type_91",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_35",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "dtype_cast_35",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_35",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "permute_42",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_42",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "alias_default_111",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_106",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_111",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "einsum_default_26",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_91",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_110",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "alias_default_112",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_27",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "dtype_cast_36",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "permute_43",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_113",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_43",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "alias_default_114",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_114",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "einsum_default_27",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_27",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3",
-      "name": "add_19",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_45",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "dtype_cast_37",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3",
-      "name": "alias_default_115",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_96",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_117",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "pow_9",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mean_8",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "add_20",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "rsqrt_8",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_118",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_118",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_28",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_37",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_116",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_116",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_29",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_97",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_38",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "dtype_cast_38",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_38",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "permute_44",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_97",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_119",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_44",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "alias_default_120",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_119",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_120",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "einsum_default_28",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_39",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "dtype_cast_39",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_39",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "permute_45",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_45",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "alias_default_121",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_119",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_121",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "einsum_default_29",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_40",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "dtype_cast_40",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_40",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "permute_46",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_46",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "alias_default_122",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_119",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_122",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "einsum_default_30",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_106",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_107",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_108",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_104",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_109",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_complex_8",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_105",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_110",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_complex_9",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_111",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_111",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_123",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_123",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "mul_30",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_real_8",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_112",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_123",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "mul_31",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_real_9",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_113",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_106",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_107",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "unsqueeze_8",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "expand_8",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "clone_8",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_114",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "unsqueeze_9",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "expand_9",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "clone_9",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_115",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_47",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_48",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_115",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_49",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_124",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_125",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_126",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_4",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_36",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_37",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_4",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_42",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_4",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_43",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "alias_default_127",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_50",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_116",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_41",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "dtype_cast_41",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_41",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "permute_51",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_116",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_128",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_51",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "alias_default_129",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_128",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_129",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "einsum_default_31",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4",
-      "name": "add_21",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_46",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "dtype_cast_42",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4",
-      "name": "alias_default_130",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_110",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_110",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_132",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_132",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "pow_10",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mean_9",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "add_22",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "rsqrt_9",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_133",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_132",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_133",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_32",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_42",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_131",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_131",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_33",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_111",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_42",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "dtype_cast_43",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_43",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "permute_52",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_111",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_134",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_52",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "alias_default_135",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_134",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_135",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "einsum_default_32",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "alias_default_136",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_136",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "convert_element_type_114",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_137",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "neg_4",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "exp_4",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "add_23",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "div_4",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "convert_element_type_115",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_44",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "dtype_cast_44",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_44",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "permute_53",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_53",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "alias_default_139",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_134",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_139",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "einsum_default_33",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_115",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_138",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "alias_default_140",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_140",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_34",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "dtype_cast_45",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "permute_54",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_141",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_54",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "alias_default_142",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_141",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_142",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "einsum_default_34",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_34",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4",
-      "name": "add_24",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_54",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "dtype_cast_46",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4",
-      "name": "alias_default_143",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_143",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_120",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_120",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_145",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "pow_11",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mean_10",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "add_25",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "rsqrt_10",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_146",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_35",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_46",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_144",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_144",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_36",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_121",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_47",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "dtype_cast_47",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_47",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "permute_55",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_121",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_147",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_55",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "alias_default_148",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_147",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_148",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "einsum_default_35",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_48",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "dtype_cast_48",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_48",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "permute_56",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_56",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "alias_default_149",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_147",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_149",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "einsum_default_36",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_49",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "dtype_cast_49",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_49",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "permute_57",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_57",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "alias_default_150",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_147",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_150",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "einsum_default_37",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_131",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_132",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_133",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_131",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_128",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_134",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_134",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_complex_10",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_132",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_129",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_129",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_135",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_135",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_complex_11",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_136",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_136",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_151",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "mul_37",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_real_10",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_137",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "mul_38",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_real_11",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_138",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_130",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_131",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_131",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "unsqueeze_10",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "expand_10",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "clone_10",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_139",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_133",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "unsqueeze_11",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "expand_11",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "clone_11",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_140",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_130",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_58",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_139",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_59",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_140",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_60",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_58",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_152",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_153",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_154",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_152",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_153",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_5",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_45",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_46",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_5",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_51",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_5",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_52",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "alias_default_155",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_155",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_61",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_141",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_50",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "dtype_cast_50",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_50",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "permute_62",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_156",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_62",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "alias_default_157",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_157",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "einsum_default_38",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_143",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5",
-      "name": "add_26",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_55",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "dtype_cast_51",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5",
-      "name": "alias_default_158",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_158",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_134",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_134",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_160",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_160",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "pow_12",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mean_11",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "add_27",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "rsqrt_11",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_161",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_160",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_161",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_39",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_51",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_159",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_159",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_40",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_135",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_51",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "dtype_cast_52",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_52",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "permute_63",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_135",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_162",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_63",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "alias_default_163",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_162",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_163",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "einsum_default_39",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "alias_default_164",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_164",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "convert_element_type_138",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_165",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "neg_5",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "exp_5",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "add_28",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "div_5",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "convert_element_type_139",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_53",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "dtype_cast_53",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_53",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "permute_64",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_64",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "alias_default_167",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_162",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_167",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "einsum_default_40",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_139",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_166",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "alias_default_168",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_166",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_41",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "dtype_cast_54",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "permute_65",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_169",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_65",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "alias_default_170",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_169",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_170",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "einsum_default_41",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_158",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_41",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5",
-      "name": "add_29",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_63",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "dtype_cast_55",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5",
-      "name": "alias_default_171",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_144",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_144",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_173",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "pow_13",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mean_12",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "add_30",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "rsqrt_12",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_174",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_174",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_42",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_55",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_172",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_172",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_43",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_145",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_56",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "dtype_cast_56",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_56",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "permute_66",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_175",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_66",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "alias_default_176",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_175",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_176",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "einsum_default_42",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_57",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "dtype_cast_57",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_57",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "permute_67",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_67",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "alias_default_177",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_175",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_177",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "einsum_default_43",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_58",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "dtype_cast_58",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_58",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "permute_68",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_68",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "alias_default_178",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_175",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_178",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "einsum_default_44",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_156",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_157",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_158",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_156",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_152",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_152",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_159",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_159",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_complex_12",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_157",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_153",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_153",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_160",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_160",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_complex_13",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_161",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_161",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_179",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_179",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "mul_44",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_real_12",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_162",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_179",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "mul_45",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_real_13",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_163",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_162",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_154",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_163",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_155",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_155",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "unsqueeze_12",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "expand_12",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "clone_12",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_164",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_158",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "unsqueeze_13",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "expand_13",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "clone_13",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_165",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_154",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_69",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_164",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_70",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_71",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_180",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_181",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_182",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_182",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_6",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_54",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_55",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_6",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_60",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_6",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_61",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "alias_default_183",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_183",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_72",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_72",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_166",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_59",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "dtype_cast_59",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_59",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "permute_73",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_166",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_184",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_73",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "alias_default_185",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_184",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_185",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "einsum_default_45",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6",
-      "name": "add_31",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_64",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "dtype_cast_60",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6",
-      "name": "alias_default_186",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_158",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_158",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_188",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_188",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "pow_14",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mean_13",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "add_32",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "rsqrt_13",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_189",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_188",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_46",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_60",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_187",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_187",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_47",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_159",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_60",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "dtype_cast_61",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_61",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "permute_74",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_159",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_190",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_74",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "alias_default_191",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_190",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_191",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "einsum_default_46",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "alias_default_192",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_192",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "convert_element_type_162",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_162",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_193",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_193",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "neg_6",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "exp_6",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "add_33",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_193",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "div_6",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "convert_element_type_163",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_62",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "dtype_cast_62",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_62",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "permute_75",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_75",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "alias_default_195",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_190",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_195",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "einsum_default_47",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_163",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_194",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "alias_default_196",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_194",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_196",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_48",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "dtype_cast_63",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "permute_76",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_197",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_76",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "alias_default_198",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_198",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "einsum_default_48",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_48",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6",
-      "name": "add_34",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_72",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "dtype_cast_64",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6",
-      "name": "alias_default_199",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_199",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_168",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_168",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_201",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_201",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "pow_15",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mean_14",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "add_35",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "rsqrt_14",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_202",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_201",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_49",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_64",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_200",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_200",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_50",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_169",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_65",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "dtype_cast_65",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_65",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "permute_77",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_169",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_203",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_77",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "alias_default_204",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_203",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_204",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "einsum_default_49",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_66",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "dtype_cast_66",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_66",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "permute_78",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_78",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "alias_default_205",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_203",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_205",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "einsum_default_50",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_67",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "dtype_cast_67",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_67",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "permute_79",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_79",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "alias_default_206",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_203",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_206",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "einsum_default_51",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_181",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_182",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_183",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_181",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_176",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_184",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_184",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_complex_14",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_182",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_177",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_177",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_185",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_185",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_complex_15",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_186",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_186",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_207",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_207",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "mul_51",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_real_14",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_187",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_207",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "mul_52",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_real_15",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_188",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_187",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_178",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_188",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_179",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_179",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "unsqueeze_14",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "expand_14",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "clone_14",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_189",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_183",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "unsqueeze_15",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "expand_15",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "clone_15",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_190",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_178",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_80",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_189",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_81",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_190",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_82",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_80",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_208",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_209",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_82",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_210",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_208",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_209",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_210",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_7",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_63",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_64",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_7",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_69",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_7",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_70",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "alias_default_211",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_211",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_83",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_191",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_68",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "dtype_cast_68",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_68",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "permute_84",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_191",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_212",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_84",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "alias_default_213",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_212",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_213",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "einsum_default_52",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_199",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7",
-      "name": "add_36",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_73",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "dtype_cast_69",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7",
-      "name": "alias_default_214",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_182",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_182",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_216",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "pow_16",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mean_15",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "add_37",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "rsqrt_15",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_217",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_53",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_69",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_215",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_53",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_215",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_54",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_183",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_69",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "dtype_cast_70",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_70",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "permute_85",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_183",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_218",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_85",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "alias_default_219",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_218",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_219",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "einsum_default_53",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "alias_default_220",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_220",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "convert_element_type_186",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_186",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_221",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_221",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "neg_7",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "exp_7",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "add_38",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_221",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "div_7",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "convert_element_type_187",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_71",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "dtype_cast_71",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_71",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "permute_86",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_86",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "alias_default_223",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_218",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_223",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "einsum_default_54",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_187",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_222",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "alias_default_224",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_222",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_55",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "dtype_cast_72",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "permute_87",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_225",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_87",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "alias_default_226",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_225",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_226",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "einsum_default_55",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_55",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7",
-      "name": "add_39",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_81",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "dtype_cast_73",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7",
-      "name": "alias_default_227",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_192",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_192",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_229",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_229",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "pow_17",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mean_16",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "add_40",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "rsqrt_16",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_230",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_229",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_230",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_56",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_73",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_228",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_228",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_57",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_193",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_74",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "dtype_cast_74",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_74",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "permute_88",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_193",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_231",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_88",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "alias_default_232",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_231",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_232",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "einsum_default_56",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_75",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "dtype_cast_75",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_75",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "permute_89",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_89",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "alias_default_233",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_231",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_233",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "einsum_default_57",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_76",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "dtype_cast_76",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_76",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "permute_90",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_90",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "alias_default_234",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_231",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_234",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "einsum_default_58",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_206",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_207",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_208",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_206",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_200",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_200",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_209",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_209",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_complex_16",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_207",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_201",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_201",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_210",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_210",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_complex_17",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_211",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_211",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_235",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_235",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "mul_58",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_real_16",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_212",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_235",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "mul_59",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_real_17",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_213",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_212",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_202",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_213",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_203",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_203",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "unsqueeze_16",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "expand_16",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "clone_16",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_214",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_208",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "unsqueeze_17",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "expand_17",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "clone_17",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_215",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_202",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_91",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_214",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_92",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_215",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_93",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_236",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_92",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_237",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_93",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_238",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_238",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_8",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_72",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_73",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_8",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_78",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_8",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_79",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "alias_default_239",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_239",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_94",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_94",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_216",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_77",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "dtype_cast_77",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_77",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "permute_95",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_240",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_95",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "alias_default_241",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_240",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_241",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "einsum_default_59",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8",
-      "name": "add_41",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_82",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "dtype_cast_78",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8",
-      "name": "alias_default_242",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_206",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_206",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_244",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_244",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "pow_18",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mean_17",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "add_42",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "rsqrt_17",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_245",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_244",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_245",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_60",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_78",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_243",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_243",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_61",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_207",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_78",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "dtype_cast_79",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_79",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "permute_96",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_207",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_246",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_96",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "alias_default_247",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_246",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_247",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "einsum_default_60",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "alias_default_248",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "convert_element_type_210",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_210",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_249",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_249",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "neg_8",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "exp_8",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "add_43",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_249",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "div_8",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "convert_element_type_211",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_80",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "dtype_cast_80",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_80",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "permute_97",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_97",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "alias_default_251",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_246",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_251",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "einsum_default_61",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_211",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_250",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "alias_default_252",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_250",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_252",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_62",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_79",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "dtype_cast_81",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "permute_98",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_253",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_98",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "alias_default_254",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_253",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_254",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "einsum_default_62",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_62",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8",
-      "name": "add_44",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_90",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "dtype_cast_82",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8",
-      "name": "alias_default_255",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_255",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_216",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_257",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_257",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "pow_19",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mean_18",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "add_45",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "rsqrt_18",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_258",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_257",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_63",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_82",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_256",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_256",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_64",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_217",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_83",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "dtype_cast_83",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_83",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "permute_99",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_259",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_99",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "alias_default_260",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_259",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_260",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "einsum_default_63",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_84",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "dtype_cast_84",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_84",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "permute_100",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_100",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "alias_default_261",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_259",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_261",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "einsum_default_64",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_85",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "dtype_cast_85",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_85",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "permute_101",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_101",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "alias_default_262",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_259",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_262",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "einsum_default_65",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_231",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_64",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_232",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_65",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_233",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_231",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_224",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_234",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_234",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_complex_18",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_232",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_225",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_225",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_235",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_235",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_complex_19",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_236",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_236",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_263",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_263",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "mul_65",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_65",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_real_18",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_237",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_263",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "mul_66",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_66",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_real_19",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_238",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_237",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_226",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_238",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_227",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_227",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "unsqueeze_18",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "expand_18",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "clone_18",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_239",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_233",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "unsqueeze_19",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "expand_19",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "clone_19",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_240",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_226",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_102",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_239",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_103",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_240",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_104",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_264",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_103",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_265",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_104",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_266",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_264",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_265",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_9",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_81",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_82",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_9",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_87",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_9",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_88",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "alias_default_267",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_267",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_105",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_241",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_86",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "dtype_cast_86",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_86",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "permute_106",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_268",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_106",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "alias_default_269",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_269",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "einsum_default_66",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_255",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_66",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9",
-      "name": "add_46",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_91",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "dtype_cast_87",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9",
-      "name": "alias_default_270",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_230",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_230",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_272",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_272",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "pow_20",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mean_19",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "add_47",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "rsqrt_19",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_273",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_272",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_273",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_67",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_87",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_271",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_67",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_271",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_68",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_68",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_231",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_87",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "dtype_cast_88",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_88",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "permute_107",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_231",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_274",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_107",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "alias_default_275",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_274",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_275",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "einsum_default_67",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_67",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "alias_default_276",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_276",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "convert_element_type_234",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_234",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_277",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_277",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "neg_9",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "exp_9",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "add_48",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_277",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "div_9",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "convert_element_type_235",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_89",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "dtype_cast_89",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_89",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "permute_108",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_108",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "alias_default_279",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_274",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_279",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "einsum_default_68",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_235",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_278",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_68",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "alias_default_280",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_278",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_280",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_69",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_88",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "dtype_cast_90",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "permute_109",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_69",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_281",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_109",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "alias_default_282",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_282",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "einsum_default_69",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_69",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9",
-      "name": "add_49",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_99",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "dtype_cast_91",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9",
-      "name": "alias_default_283",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_240",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_240",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_285",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_285",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "pow_21",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mean_20",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "add_50",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "rsqrt_20",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_286",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_285",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_286",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_70",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_91",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_284",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_284",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_71",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_241",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_92",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "dtype_cast_92",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_92",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "permute_110",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_287",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_110",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "alias_default_288",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_287",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_288",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "einsum_default_70",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_93",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "dtype_cast_93",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_93",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "permute_111",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_111",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "alias_default_289",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_287",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_289",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "einsum_default_71",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_94",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "dtype_cast_94",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_94",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "permute_112",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_112",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "alias_default_290",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_287",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_290",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "einsum_default_72",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_70",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_256",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_71",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_257",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_72",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_258",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_256",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_248",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_259",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_259",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_complex_20",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_257",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_249",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_249",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_260",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_260",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_complex_21",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_261",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_261",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_291",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_291",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "mul_72",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_72",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_real_20",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_262",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_291",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "mul_73",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_73",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_real_21",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_263",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_262",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_250",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_263",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_251",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_251",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "unsqueeze_20",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "expand_20",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "clone_20",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_264",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_258",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "unsqueeze_21",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "expand_21",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "clone_21",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_265",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_250",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_113",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_264",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_114",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_265",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_115",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_113",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_292",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_114",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_293",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_294",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_292",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_293",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_294",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_10",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_90",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_91",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_10",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_96",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_10",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_97",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "alias_default_295",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_295",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_116",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_116",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_266",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_95",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "dtype_cast_95",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_95",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "permute_117",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_296",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_117",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "alias_default_297",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_296",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_297",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "einsum_default_73",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_73",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10",
-      "name": "add_51",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_100",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "dtype_cast_96",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10",
-      "name": "alias_default_298",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_254",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_254",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_300",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "pow_22",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mean_21",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "add_52",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "rsqrt_21",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_301",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_74",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_96",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_299",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_299",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_75",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_75",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_255",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_96",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "dtype_cast_97",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_97",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "permute_118",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_255",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_302",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_118",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "alias_default_303",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_302",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_303",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "einsum_default_74",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_74",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "alias_default_304",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_304",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "convert_element_type_258",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_258",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_305",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_305",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "neg_10",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "exp_10",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "add_53",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_305",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "div_10",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "convert_element_type_259",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_98",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "dtype_cast_98",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_98",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "permute_119",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_119",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "alias_default_307",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_302",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_307",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "einsum_default_75",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_259",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_306",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_75",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "alias_default_308",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_306",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_308",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_76",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_97",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "dtype_cast_99",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "permute_120",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_76",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_309",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_120",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "alias_default_310",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_309",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_310",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "einsum_default_76",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_76",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10",
-      "name": "add_54",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_108",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "dtype_cast_100",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10",
-      "name": "alias_default_311",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_264",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_264",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_313",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_313",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "pow_23",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mean_22",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "add_55",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "rsqrt_22",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_314",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_313",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_77",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_100",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_312",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_312",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_78",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_78",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_265",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_101",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "dtype_cast_101",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_101",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "permute_121",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_265",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_315",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_121",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "alias_default_316",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_315",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_316",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "einsum_default_77",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_102",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "dtype_cast_102",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_102",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "permute_122",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_122",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "alias_default_317",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_315",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_317",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "einsum_default_78",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_103",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "dtype_cast_103",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_103",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "permute_123",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_123",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "alias_default_318",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_315",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_318",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "einsum_default_79",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_77",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_281",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_78",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_282",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_79",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_283",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_272",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_272",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_284",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_284",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_complex_22",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_282",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_273",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_273",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_285",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_285",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_complex_23",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_286",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_286",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_319",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_319",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "mul_79",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_79",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_real_22",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_287",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_319",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "mul_80",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_80",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_real_23",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_288",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_287",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_274",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_288",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_275",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_275",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "unsqueeze_22",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "expand_22",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "clone_22",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_289",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_283",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "unsqueeze_23",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "expand_23",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "clone_23",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_290",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_274",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_124",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_289",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_125",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_290",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_126",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_320",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_321",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_322",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_320",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_11",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_99",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_100",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_105",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_106",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "alias_default_323",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_127",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_291",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_104",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "dtype_cast_104",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_104",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "permute_128",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_324",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_128",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "alias_default_325",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_324",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_325",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "einsum_default_80",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_80",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11",
-      "name": "add_56",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_109",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "dtype_cast_105",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11",
-      "name": "alias_default_326",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_278",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_278",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_328",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_328",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "pow_24",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mean_23",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "add_57",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "rsqrt_23",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_329",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_328",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_329",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_81",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_105",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_327",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_327",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_82",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_82",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_279",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_105",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "dtype_cast_106",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_106",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "permute_129",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_279",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_330",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_129",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "alias_default_331",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_330",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_331",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "einsum_default_81",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "alias_default_332",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_332",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "convert_element_type_282",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_282",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_333",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_333",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "neg_11",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "exp_11",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "add_58",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_333",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "div_11",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "convert_element_type_283",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_107",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "dtype_cast_107",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_107",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "permute_130",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_130",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "alias_default_335",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_330",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_335",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "einsum_default_82",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_283",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_334",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "alias_default_336",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_83",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_106",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "dtype_cast_108",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_108",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "permute_131",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_337",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_131",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "alias_default_338",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_337",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_338",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "einsum_default_83",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_83",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11",
-      "name": "add_59",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_117",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "dtype_cast_109",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11",
-      "name": "alias_default_339",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_288",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_288",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_341",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_341",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "pow_25",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mean_24",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "add_60",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "rsqrt_24",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_342",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_341",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_342",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_84",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_109",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_340",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_84",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_340",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_85",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_85",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_289",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_110",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "dtype_cast_110",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_110",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "permute_132",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_289",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_343",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_132",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "alias_default_344",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_343",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_344",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "einsum_default_84",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_111",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "dtype_cast_111",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_111",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "permute_133",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_133",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "alias_default_345",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_343",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_345",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "einsum_default_85",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_112",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "dtype_cast_112",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_112",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "permute_134",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_134",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "alias_default_346",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_343",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_346",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "einsum_default_86",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_306",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_307",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_86",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_308",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_306",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_296",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_296",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_309",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_309",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_complex_24",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_307",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_297",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_297",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_310",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_310",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_complex_25",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_311",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_311",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_347",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_347",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "mul_86",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_86",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_real_24",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_312",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_347",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "mul_87",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_87",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_real_25",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_313",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_312",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_298",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_313",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_299",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_299",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "unsqueeze_24",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "expand_24",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "clone_24",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_314",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_308",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "unsqueeze_25",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "expand_25",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "clone_25",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_315",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_298",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_135",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_314",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_136",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_315",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_137",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_135",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_348",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_136",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_349",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_137",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_350",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_350",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_12",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_108",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_109",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_12",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_114",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_12",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_115",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_108",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "alias_default_351",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_351",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_138",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_316",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_113",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "dtype_cast_113",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_113",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "permute_139",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_316",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_352",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_139",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "alias_default_353",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_352",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_353",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "einsum_default_87",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12",
-      "name": "add_61",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_118",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "dtype_cast_114",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12",
-      "name": "alias_default_354",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_302",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_302",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_356",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_356",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "pow_26",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mean_25",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "add_62",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "rsqrt_25",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_357",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_356",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_357",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_88",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_114",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_355",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_88",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_355",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_89",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_89",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_303",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_114",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "dtype_cast_115",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_115",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "permute_140",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_303",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_358",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_140",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "alias_default_359",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_358",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_359",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "einsum_default_88",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_88",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "alias_default_360",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_360",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "convert_element_type_306",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_306",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_361",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_361",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "neg_12",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "exp_12",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "add_63",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_361",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "div_12",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "convert_element_type_307",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_116",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "dtype_cast_116",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_116",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "permute_141",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_141",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "alias_default_363",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_358",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_363",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "einsum_default_89",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_307",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_362",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_89",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "alias_default_364",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_362",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_364",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_90",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "dtype_cast_117",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "permute_142",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_90",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_365",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_142",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "alias_default_366",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_365",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_366",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "einsum_default_90",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_90",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12",
-      "name": "add_64",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_126",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "dtype_cast_118",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12",
-      "name": "alias_default_367",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_367",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_312",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_312",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_369",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "pow_27",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mean_26",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "add_65",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_65",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "rsqrt_26",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_370",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_91",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_118",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_368",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_368",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_92",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_92",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_313",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_119",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "dtype_cast_119",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_119",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "permute_143",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_313",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_371",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_143",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "alias_default_372",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_371",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_372",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "einsum_default_91",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_120",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "dtype_cast_120",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_120",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "permute_144",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_144",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "alias_default_373",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_371",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_373",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "einsum_default_92",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_121",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "dtype_cast_121",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_121",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "permute_145",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_145",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "alias_default_374",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_371",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_374",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "einsum_default_93",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_91",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_331",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_92",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_332",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_93",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_333",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_331",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_320",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_320",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_334",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_complex_26",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_332",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_321",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_321",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_335",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_335",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_complex_27",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_336",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_336",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_375",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_375",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "mul_93",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_93",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_real_26",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_337",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_375",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "mul_94",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_94",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_real_27",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_338",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_337",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_322",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_338",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_323",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_323",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "unsqueeze_26",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "expand_26",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "clone_26",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_339",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_333",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "unsqueeze_27",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "expand_27",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "clone_27",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_340",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_322",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_146",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_339",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_147",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_340",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_148",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_376",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_377",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_148",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_378",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_376",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_377",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_13",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_117",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_118",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_13",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_123",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_13",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_124",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "alias_default_379",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_379",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_149",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_341",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_122",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "dtype_cast_122",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_122",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "permute_150",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_341",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_380",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_150",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "alias_default_381",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_381",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "einsum_default_94",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_367",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_94",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13",
-      "name": "add_66",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_127",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "dtype_cast_123",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_66",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13",
-      "name": "alias_default_382",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_326",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_384",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_384",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "pow_28",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mean_27",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "add_67",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_67",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "rsqrt_27",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_385",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_384",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_385",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_95",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_123",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_383",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_95",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_383",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_96",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_327",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_123",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "dtype_cast_124",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_124",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "permute_151",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_327",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_386",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_151",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "alias_default_387",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_386",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_387",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "einsum_default_95",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_95",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "alias_default_388",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_388",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "convert_element_type_330",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_330",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_389",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_389",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "neg_13",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "exp_13",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "add_68",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_389",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_68",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "div_13",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "convert_element_type_331",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_125",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "dtype_cast_125",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_125",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "permute_152",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_152",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "alias_default_391",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_386",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_391",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "einsum_default_96",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_331",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_390",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_96",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "alias_default_392",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_97",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "dtype_cast_126",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "permute_153",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_97",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_393",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_153",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "alias_default_394",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_394",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "einsum_default_97",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_97",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13",
-      "name": "add_69",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_135",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "dtype_cast_127",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13",
-      "name": "alias_default_395",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_395",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_336",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_336",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_397",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_397",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "pow_29",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mean_28",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "add_70",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "rsqrt_28",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_398",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_397",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_398",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_98",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_127",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_396",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_98",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_396",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_99",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_337",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_128",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "dtype_cast_128",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_128",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "permute_154",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_337",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_399",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_154",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "alias_default_400",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_399",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_400",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "einsum_default_98",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_129",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "dtype_cast_129",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_129",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "permute_155",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_155",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "alias_default_401",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_399",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_401",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "einsum_default_99",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_130",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "dtype_cast_130",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_130",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "permute_156",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_156",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "alias_default_402",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_399",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_402",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "einsum_default_100",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_98",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_356",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_99",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_357",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_100",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_358",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_356",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_344",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_344",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_359",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_359",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_complex_28",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_357",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_345",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_345",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_360",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_360",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_complex_29",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_361",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_361",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_403",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_403",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "mul_100",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_100",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_real_28",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_362",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_403",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "mul_101",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_real_29",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_363",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_362",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_346",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_363",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_347",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_347",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "unsqueeze_28",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "expand_28",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "clone_28",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_364",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_358",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "unsqueeze_29",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "expand_29",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "clone_29",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_365",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_346",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_157",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_364",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_158",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_365",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_159",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_404",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_158",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_405",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_159",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_406",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_404",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_405",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_406",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_14",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_126",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_127",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_14",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_132",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_14",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_133",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "alias_default_407",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_407",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_160",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_160",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_366",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_131",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "dtype_cast_131",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_131",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "permute_161",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_366",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_408",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_161",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "alias_default_409",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_408",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_409",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "einsum_default_101",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_395",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_101",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14",
-      "name": "add_71",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_136",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "dtype_cast_132",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14",
-      "name": "alias_default_410",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_410",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_350",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_350",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_412",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_412",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "pow_30",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mean_29",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "add_72",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "rsqrt_29",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_413",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_412",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_413",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_102",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_132",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_411",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_411",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_103",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_103",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_351",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_132",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "dtype_cast_133",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_133",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "permute_162",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_351",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_414",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_162",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "alias_default_415",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_414",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_415",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "einsum_default_102",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "alias_default_416",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "convert_element_type_354",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_354",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_417",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_417",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "neg_14",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "exp_14",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "add_73",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_417",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_73",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "div_14",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "convert_element_type_355",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_134",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "dtype_cast_134",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_134",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "permute_163",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_163",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "alias_default_419",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_414",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_419",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "einsum_default_103",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_355",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_418",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "alias_default_420",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_418",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_420",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_104",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_133",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "dtype_cast_135",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_135",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "permute_164",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_421",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_164",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "alias_default_422",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_421",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_422",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "einsum_default_104",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_410",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_104",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14",
-      "name": "add_74",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_144",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "dtype_cast_136",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14",
-      "name": "alias_default_423",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_423",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_360",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_360",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_425",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_425",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "pow_31",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mean_30",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "add_75",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_75",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "rsqrt_30",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_426",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_425",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_426",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_105",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_136",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_424",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_105",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_424",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_106",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_106",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_361",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_137",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "dtype_cast_137",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_137",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "permute_165",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_361",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_427",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_165",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "alias_default_428",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_427",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_428",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "einsum_default_105",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_138",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "dtype_cast_138",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_138",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "permute_166",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_166",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "alias_default_429",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_427",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_429",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "einsum_default_106",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_139",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "dtype_cast_139",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_139",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "permute_167",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_167",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "alias_default_430",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_427",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_430",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "einsum_default_107",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_381",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_382",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_383",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_381",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_368",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_368",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_384",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_384",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_complex_30",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_382",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_369",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_369",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_385",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_385",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_complex_31",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_386",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_386",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_431",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_431",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "mul_107",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_real_30",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_387",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_431",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "mul_108",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_real_31",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_388",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_387",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_370",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_388",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_371",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_371",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "unsqueeze_30",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "expand_30",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "clone_30",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_389",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_383",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "unsqueeze_31",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "expand_31",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "clone_31",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_390",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_370",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_168",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_389",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_169",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_170",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_168",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_432",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_169",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_433",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_170",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_434",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_432",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_433",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_434",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_15",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_135",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_136",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_15",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_141",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_15",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_142",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_135",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "alias_default_435",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_435",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_171",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_171",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_391",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_140",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "dtype_cast_140",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_140",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "permute_172",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_391",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_436",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_172",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "alias_default_437",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_436",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_437",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "einsum_default_108",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_423",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_108",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15",
-      "name": "add_76",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_145",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "dtype_cast_141",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_76",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15",
-      "name": "alias_default_438",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_438",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_374",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_374",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_440",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_440",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "pow_32",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mean_31",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "add_77",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "rsqrt_31",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_441",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_440",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_441",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_109",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_141",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_439",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_439",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_110",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_110",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_375",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_141",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "dtype_cast_142",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_142",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "permute_173",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_375",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_442",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_173",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "alias_default_443",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_442",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_443",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "einsum_default_109",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "alias_default_444",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_444",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "convert_element_type_378",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_378",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_445",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_445",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "neg_15",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "exp_15",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "add_78",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_445",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_78",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "div_15",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "convert_element_type_379",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_143",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "dtype_cast_143",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_143",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "permute_174",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_174",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "alias_default_447",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_442",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_447",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "einsum_default_110",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_379",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_446",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "alias_default_448",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_446",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_448",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_111",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_142",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "dtype_cast_144",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_144",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "permute_175",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_449",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_175",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "alias_default_450",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_449",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_450",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "einsum_default_111",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_438",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_111",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15",
-      "name": "add_79",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_153",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "dtype_cast_145",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_79",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15",
-      "name": "alias_default_451",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_451",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_384",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_384",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_453",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_453",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "pow_33",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mean_32",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "add_80",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_80",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "rsqrt_32",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_454",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_453",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_454",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_112",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_145",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_452",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_112",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_452",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_113",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_113",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_385",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_146",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "dtype_cast_146",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_146",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "permute_176",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_385",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_455",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_176",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "alias_default_456",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_455",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_456",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "einsum_default_112",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_147",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "dtype_cast_147",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_147",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "permute_177",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_177",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "alias_default_457",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_455",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_457",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "einsum_default_113",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_148",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "dtype_cast_148",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_148",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "permute_178",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_178",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "alias_default_458",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_455",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_458",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "einsum_default_114",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_406",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_407",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_408",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_406",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_392",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_409",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_409",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_complex_32",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_407",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_393",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_410",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_410",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_complex_33",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_411",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_411",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_459",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_459",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "mul_114",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_real_32",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_412",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_459",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "mul_115",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_115",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_real_33",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_413",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_412",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_394",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_413",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_395",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_395",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "unsqueeze_32",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "expand_32",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "clone_32",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_414",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_408",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "unsqueeze_33",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "expand_33",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "clone_33",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_415",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_394",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_179",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_414",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_180",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_415",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_181",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_179",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_460",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_461",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_462",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_460",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_461",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_462",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_16",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_144",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_145",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_16",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_150",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_16",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_151",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_144",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "alias_default_463",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_463",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_182",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_182",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_416",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_149",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "dtype_cast_149",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_149",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "permute_183",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_416",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_464",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_183",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "alias_default_465",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_464",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_465",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "einsum_default_115",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_451",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16",
-      "name": "add_81",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_154",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "dtype_cast_150",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16",
-      "name": "alias_default_466",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_466",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_398",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_398",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_468",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_468",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "pow_34",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mean_33",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "add_82",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_82",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "rsqrt_33",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_469",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_468",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_469",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_116",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_150",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_467",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_116",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_467",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_117",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_399",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_150",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "dtype_cast_151",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_151",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "permute_184",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_399",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_470",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_184",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "alias_default_471",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_470",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_471",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "einsum_default_116",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_116",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "alias_default_472",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_472",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "convert_element_type_402",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_402",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_473",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_473",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "neg_16",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "exp_16",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "add_83",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_473",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "div_16",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "convert_element_type_403",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_152",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "dtype_cast_152",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_152",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "permute_185",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_185",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "alias_default_475",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_470",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_475",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "einsum_default_117",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_403",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_474",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_117",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "alias_default_476",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_474",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_476",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_118",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_151",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "dtype_cast_153",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_153",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "permute_186",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_118",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_477",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_186",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "alias_default_478",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_477",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_478",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "einsum_default_118",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_466",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_118",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16",
-      "name": "add_84",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_162",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "dtype_cast_154",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_84",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16",
-      "name": "alias_default_479",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_479",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_408",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_408",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_481",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_481",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "pow_35",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mean_34",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "add_85",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_85",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "rsqrt_34",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_482",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_481",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_482",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_119",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_154",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_480",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_119",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_480",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_120",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_120",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_409",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_155",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "dtype_cast_155",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_155",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "permute_187",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_409",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_483",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_187",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "alias_default_484",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_483",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_484",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "einsum_default_119",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_156",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "dtype_cast_156",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_156",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "permute_188",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_188",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "alias_default_485",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_483",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_485",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "einsum_default_120",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_157",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "dtype_cast_157",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_157",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "permute_189",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_189",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "alias_default_486",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_483",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_486",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "einsum_default_121",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_119",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_431",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_432",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_433",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_431",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_416",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_434",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_434",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_complex_34",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_432",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_417",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_417",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_435",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_435",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_complex_35",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_436",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_436",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_487",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_487",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "mul_121",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_real_34",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_437",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_487",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "mul_122",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_122",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_real_35",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_438",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_437",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_418",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_438",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_419",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_419",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "unsqueeze_34",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "expand_34",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "clone_34",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_439",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_433",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "unsqueeze_35",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "expand_35",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "clone_35",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_440",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_418",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_190",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_439",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_191",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_440",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_192",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_190",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_488",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_191",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_489",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_192",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_490",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_488",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_489",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_490",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_17",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_153",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_154",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_17",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_159",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_17",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_160",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_153",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "alias_default_491",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_491",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_193",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_193",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_441",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_158",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "dtype_cast_158",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_158",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "permute_194",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_441",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_492",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_194",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "alias_default_493",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_492",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_493",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "einsum_default_122",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_479",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_122",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17",
-      "name": "add_86",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_163",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "dtype_cast_159",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_86",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17",
-      "name": "alias_default_494",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_494",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_422",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_422",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_496",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_496",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "pow_36",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mean_35",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "add_87",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "rsqrt_35",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_497",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_496",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_497",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_123",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_159",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_495",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_123",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_495",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_124",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_423",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_159",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "dtype_cast_160",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_160",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "permute_195",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_423",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_498",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_195",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "alias_default_499",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_498",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_499",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "einsum_default_123",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_123",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "alias_default_500",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_500",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "convert_element_type_426",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_426",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_501",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_501",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "neg_17",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "exp_17",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "add_88",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_501",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_88",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "div_17",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "convert_element_type_427",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_161",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "dtype_cast_161",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_161",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "permute_196",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_196",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "alias_default_503",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_498",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_503",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "einsum_default_124",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_427",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_502",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_124",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "alias_default_504",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_502",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_504",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_125",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_160",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "dtype_cast_162",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_162",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "permute_197",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_125",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_505",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_197",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "alias_default_506",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_505",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_506",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "einsum_default_125",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_494",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_125",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17",
-      "name": "add_89",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_171",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "dtype_cast_163",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_89",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17",
-      "name": "alias_default_507",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_507",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_432",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_432",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_509",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_509",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "pow_37",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mean_36",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "add_90",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "rsqrt_36",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_510",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_509",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_510",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_126",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_163",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_508",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_508",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_127",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_433",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_164",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "dtype_cast_164",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_164",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "permute_198",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_433",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_511",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_198",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "alias_default_512",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_511",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_512",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "einsum_default_126",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_165",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "dtype_cast_165",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_165",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "permute_199",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_199",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "alias_default_513",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_511",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_513",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "einsum_default_127",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_166",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "dtype_cast_166",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_166",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "permute_200",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_200",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "alias_default_514",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_511",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_514",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "einsum_default_128",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_456",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_457",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_458",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_456",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_440",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_440",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_459",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_459",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_complex_36",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_457",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_441",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_441",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_460",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_460",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_complex_37",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_461",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_461",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_515",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_515",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "mul_128",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_real_36",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_462",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_515",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "mul_129",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_129",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_real_37",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_463",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_462",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_442",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_463",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_443",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_443",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "unsqueeze_36",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "expand_36",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "clone_36",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_464",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_458",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "unsqueeze_37",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "expand_37",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "clone_37",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_465",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_442",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_201",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_464",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_202",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_465",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_203",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_201",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_516",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_517",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_203",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_518",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_516",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_517",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_518",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_18",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_162",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_163",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_18",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_168",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_18",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_169",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_162",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "alias_default_519",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_519",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_204",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_204",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_466",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_167",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "dtype_cast_167",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_167",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "permute_205",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_466",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_520",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_205",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "alias_default_521",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_520",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_521",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "einsum_default_129",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_507",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_129",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18",
-      "name": "add_91",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_172",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "dtype_cast_168",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18",
-      "name": "alias_default_522",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_522",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_446",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_446",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_524",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_524",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "pow_38",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mean_37",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "add_92",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_92",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "rsqrt_37",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_525",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_524",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_525",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_130",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_168",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_523",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_523",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_131",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_131",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_447",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_168",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "dtype_cast_169",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_169",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "permute_206",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_447",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_526",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_206",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "alias_default_527",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_526",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_527",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "einsum_default_130",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_130",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "alias_default_528",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_528",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "convert_element_type_450",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_450",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_529",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_529",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "neg_18",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "exp_18",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "add_93",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_529",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_93",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "div_18",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "convert_element_type_451",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_170",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "dtype_cast_170",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_170",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "permute_207",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_207",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "alias_default_531",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_526",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_531",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "einsum_default_131",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_451",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_530",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_131",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "alias_default_532",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_530",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_532",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_132",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_169",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "dtype_cast_171",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "permute_208",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_132",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_533",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_208",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "alias_default_534",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_533",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_534",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "einsum_default_132",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_522",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_132",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18",
-      "name": "add_94",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_180",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "dtype_cast_172",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_94",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18",
-      "name": "alias_default_535",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_535",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_456",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_456",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_537",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_537",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "pow_39",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mean_38",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "add_95",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_95",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "rsqrt_38",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_538",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_537",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_538",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_133",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_172",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_536",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_133",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_536",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_134",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_134",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_457",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_173",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "dtype_cast_173",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_173",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "permute_209",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_457",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_539",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_209",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "alias_default_540",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_539",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_540",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "einsum_default_133",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_174",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "dtype_cast_174",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_174",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "permute_210",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_210",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "alias_default_541",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_539",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_541",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "einsum_default_134",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_175",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "dtype_cast_175",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_175",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "permute_211",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_211",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "alias_default_542",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_539",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_542",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "einsum_default_135",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_133",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_481",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_134",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_482",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_135",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_483",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_481",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_464",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_464",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_484",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_484",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_complex_38",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_482",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_465",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_465",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_485",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_485",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_complex_39",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_486",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_486",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_543",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_543",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "mul_135",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_135",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_real_38",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_487",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_543",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "mul_136",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_136",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_real_39",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_488",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_487",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_466",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_488",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_467",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_467",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "unsqueeze_38",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "expand_38",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "clone_38",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_489",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_483",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "unsqueeze_39",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "expand_39",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "clone_39",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_490",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_466",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_212",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_489",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_213",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_490",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_214",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_212",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_544",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_213",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_545",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_546",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_544",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_545",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_546",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_19",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_171",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_172",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_19",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_177",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_19",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_178",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "alias_default_547",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_547",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_215",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_215",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_491",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_176",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "dtype_cast_176",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_176",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "permute_216",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_491",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_548",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_216",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "alias_default_549",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_548",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_549",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "einsum_default_136",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_535",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_136",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19",
-      "name": "add_96",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_181",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "dtype_cast_177",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19",
-      "name": "alias_default_550",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_550",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_470",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_470",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_552",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_552",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "pow_40",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mean_39",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "add_97",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_97",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "rsqrt_39",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_553",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_552",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_553",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_137",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_177",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_551",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_137",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_551",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_138",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_138",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_471",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_177",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "dtype_cast_178",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_178",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "permute_217",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_471",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_554",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_217",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "alias_default_555",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_554",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_555",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "einsum_default_137",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "alias_default_556",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_556",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "convert_element_type_474",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_474",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_557",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_557",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "neg_19",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "exp_19",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "add_98",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_557",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_98",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "div_19",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "convert_element_type_475",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_179",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "dtype_cast_179",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_179",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "permute_218",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_218",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "alias_default_559",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_554",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_559",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "einsum_default_138",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_475",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_558",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "alias_default_560",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_558",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_560",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_139",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_178",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "dtype_cast_180",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "permute_219",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_139",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_561",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_219",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "alias_default_562",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_561",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_562",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "einsum_default_139",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_550",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_139",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19",
-      "name": "add_99",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_189",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "dtype_cast_181",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19",
-      "name": "alias_default_563",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_563",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_480",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_480",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_565",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_565",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "pow_41",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mean_40",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "add_100",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_100",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "rsqrt_40",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_566",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_565",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_566",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_140",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_181",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_564",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_140",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_564",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_141",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_481",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_182",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "dtype_cast_182",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_182",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "permute_220",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_481",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_567",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_220",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "alias_default_568",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_567",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_568",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "einsum_default_140",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_183",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "dtype_cast_183",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_183",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "permute_221",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_221",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "alias_default_569",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_567",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_569",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "einsum_default_141",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_184",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "dtype_cast_184",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_184",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "permute_222",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_222",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "alias_default_570",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_567",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_570",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "einsum_default_142",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_140",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_506",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_141",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_507",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_142",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_508",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_506",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_488",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_488",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_509",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_509",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_complex_40",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_507",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_489",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_489",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_510",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_510",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_complex_41",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_511",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_511",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_571",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_571",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "mul_142",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_142",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_real_40",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_512",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_571",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "mul_143",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_143",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_real_41",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_513",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_512",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_490",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_513",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_491",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_491",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "unsqueeze_40",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "expand_40",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "clone_40",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_514",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_508",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "unsqueeze_41",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "expand_41",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "clone_41",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_515",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_490",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_223",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_514",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_224",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_515",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_225",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_223",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_572",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_224",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_573",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_225",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_574",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_572",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_573",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_574",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_20",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_180",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_181",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_20",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_186",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_20",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_187",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "alias_default_575",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_575",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_226",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_226",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_516",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_185",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "dtype_cast_185",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_185",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "permute_227",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_516",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_576",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_227",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "alias_default_577",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_576",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_577",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "einsum_default_143",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_563",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_143",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20",
-      "name": "add_101",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_190",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "dtype_cast_186",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_101",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20",
-      "name": "alias_default_578",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_578",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_494",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_494",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_580",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_580",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "pow_42",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mean_41",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "add_102",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "rsqrt_41",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_581",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_580",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_581",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_144",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_186",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_579",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_144",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_579",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_145",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_495",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_186",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "dtype_cast_187",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_187",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "permute_228",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_495",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_582",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_228",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "alias_default_583",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_582",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_583",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "einsum_default_144",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_144",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "alias_default_584",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_584",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "convert_element_type_498",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_498",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_585",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_585",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "neg_20",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "exp_20",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "add_103",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_585",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "div_20",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "convert_element_type_499",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_188",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "dtype_cast_188",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_188",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "permute_229",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_229",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "alias_default_587",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_582",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_587",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "einsum_default_145",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_499",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_586",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_145",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "alias_default_588",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_586",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_588",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_146",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_187",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "dtype_cast_189",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "permute_230",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_146",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_589",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_230",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "alias_default_590",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_589",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_590",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "einsum_default_146",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_578",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_146",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20",
-      "name": "add_104",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_198",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "dtype_cast_190",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_104",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20",
-      "name": "alias_default_591",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_591",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_504",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_504",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_593",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_593",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "pow_43",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mean_42",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "add_105",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_105",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "rsqrt_42",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_594",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_593",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_594",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_147",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_190",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_592",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_592",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_148",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_148",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_505",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_191",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "dtype_cast_191",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_191",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "permute_231",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_505",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_595",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_231",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "alias_default_596",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_595",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_596",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "einsum_default_147",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_192",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "dtype_cast_192",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_192",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "permute_232",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_232",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "alias_default_597",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_595",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_597",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "einsum_default_148",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_193",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "dtype_cast_193",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_193",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "permute_233",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_233",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "alias_default_598",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_595",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_598",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "einsum_default_149",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_147",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_531",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_148",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_532",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_533",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_531",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_512",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_512",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_534",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_534",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_complex_42",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_532",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_513",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_513",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_535",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_535",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_complex_43",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_536",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_536",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_599",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_599",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "mul_149",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_real_42",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_537",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_599",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "mul_150",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_150",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_real_43",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_538",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_537",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_514",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_538",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_515",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_515",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "unsqueeze_42",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "expand_42",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "clone_42",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_539",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_533",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "unsqueeze_43",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "expand_43",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "clone_43",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_540",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_514",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_234",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_539",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_235",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_540",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_236",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_600",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_235",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_601",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_602",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_600",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_601",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_602",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_21",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_189",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_190",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_21",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_195",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_21",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_196",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "alias_default_603",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_603",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_237",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_237",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_541",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_194",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "dtype_cast_194",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_194",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "permute_238",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_541",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_604",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_238",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "alias_default_605",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_604",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_605",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "einsum_default_150",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_591",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_150",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21",
-      "name": "add_106",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_199",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "dtype_cast_195",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_106",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21",
-      "name": "alias_default_606",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_606",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_518",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_518",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_608",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_608",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "pow_44",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mean_43",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "add_107",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_107",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "rsqrt_43",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_609",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_608",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_609",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_151",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_195",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_607",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_151",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_607",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_152",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_152",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_519",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_195",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "dtype_cast_196",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_196",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "permute_239",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_519",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_610",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_239",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "alias_default_611",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_610",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_611",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "einsum_default_151",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_151",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "alias_default_612",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_612",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "convert_element_type_522",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_522",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_613",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_613",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "neg_21",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "exp_21",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "add_108",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_613",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "div_21",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "convert_element_type_523",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_197",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "dtype_cast_197",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_197",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "permute_240",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_240",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "alias_default_615",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_610",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_615",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "einsum_default_152",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_523",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_614",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_152",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "alias_default_616",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_614",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_616",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_153",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_196",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "dtype_cast_198",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_198",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "permute_241",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_153",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_617",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_241",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "alias_default_618",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_617",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_618",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "einsum_default_153",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_606",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_153",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21",
-      "name": "add_109",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_207",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "dtype_cast_199",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21",
-      "name": "alias_default_619",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_619",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_528",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_528",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_621",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_621",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "pow_45",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mean_44",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "add_110",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_110",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "rsqrt_44",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_622",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_621",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_622",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_154",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_199",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_620",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_620",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_155",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_155",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_529",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_200",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "dtype_cast_200",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_200",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "permute_242",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_529",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_623",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_242",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "alias_default_624",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_623",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_624",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "einsum_default_154",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_201",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "dtype_cast_201",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_201",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "permute_243",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_243",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "alias_default_625",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_623",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_625",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "einsum_default_155",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_202",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "dtype_cast_202",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_202",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "permute_244",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_244",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "alias_default_626",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_623",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_626",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "einsum_default_156",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_154",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_556",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_155",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_557",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_156",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_558",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_556",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_536",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_536",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_559",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_559",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_complex_44",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_557",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_537",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_537",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_560",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_560",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_complex_45",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_561",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_561",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_627",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_627",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "mul_156",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_156",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_real_44",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_562",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_627",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "mul_157",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_157",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_real_45",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_563",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_562",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_538",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_563",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_539",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_539",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "unsqueeze_44",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "expand_44",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "clone_44",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_564",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_558",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "unsqueeze_45",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "expand_45",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "clone_45",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_565",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_538",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_245",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_564",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_246",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_565",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_247",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_245",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_628",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_246",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_629",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_247",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_630",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_628",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_629",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_630",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_22",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_198",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_199",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_22",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_204",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_22",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_205",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_198",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "alias_default_631",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_631",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_248",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_566",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_203",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "dtype_cast_203",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_203",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "permute_249",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_566",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_632",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_249",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "alias_default_633",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_632",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_633",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "einsum_default_157",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_619",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22",
-      "name": "add_111",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_208",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "dtype_cast_204",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_111",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22",
-      "name": "alias_default_634",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_634",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_542",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_542",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_636",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_636",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "pow_46",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mean_45",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "add_112",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_112",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "rsqrt_45",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_637",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_636",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_637",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_158",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_204",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_635",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_158",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_635",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_159",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_159",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_543",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_204",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "dtype_cast_205",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_205",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "permute_250",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_543",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_638",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_250",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "alias_default_639",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_638",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_639",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "einsum_default_158",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_158",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "alias_default_640",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_640",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "convert_element_type_546",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_546",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_641",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_641",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "neg_22",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "exp_22",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "add_113",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_641",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "div_22",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "convert_element_type_547",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_206",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "dtype_cast_206",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_206",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "permute_251",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_251",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "alias_default_643",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_638",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_643",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "einsum_default_159",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_547",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_642",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_159",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "alias_default_644",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_642",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_644",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_160",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_205",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "dtype_cast_207",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_207",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "permute_252",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_160",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_645",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_252",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "alias_default_646",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_645",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_646",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "einsum_default_160",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_634",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_160",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22",
-      "name": "add_114",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_216",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "dtype_cast_208",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_114",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22",
-      "name": "alias_default_647",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_647",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_552",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_552",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_649",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_649",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "pow_47",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mean_46",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "add_115",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "rsqrt_46",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_650",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_649",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_650",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_161",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_208",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_648",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_161",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_648",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_162",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_162",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_553",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_209",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "dtype_cast_209",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_209",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "permute_253",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_553",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_651",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_253",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "alias_default_652",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_651",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_652",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "einsum_default_161",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_210",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "dtype_cast_210",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_210",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "permute_254",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_254",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "alias_default_653",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_651",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_653",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "einsum_default_162",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_211",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "dtype_cast_211",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_211",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "permute_255",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_255",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "alias_default_654",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_651",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_654",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "einsum_default_163",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_161",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_581",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_162",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_582",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_163",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_583",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_581",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_560",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_560",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_584",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_584",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_complex_46",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_582",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_561",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_561",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_585",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_585",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_complex_47",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_586",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_586",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_655",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_655",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "mul_163",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_163",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_real_46",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_587",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_655",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "mul_164",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_164",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_real_47",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_588",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_587",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_562",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_588",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_563",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_563",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "unsqueeze_46",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "expand_46",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "clone_46",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_589",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_583",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "unsqueeze_47",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "expand_47",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "clone_47",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_590",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_562",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_256",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_589",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_257",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_590",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_258",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_256",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_656",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_257",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_657",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_658",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_656",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_657",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_658",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_23",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_207",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_208",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_23",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_213",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_23",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_214",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_207",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "alias_default_659",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_659",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_259",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_259",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_591",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_212",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "dtype_cast_212",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_212",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "permute_260",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_591",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_660",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_260",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "alias_default_661",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_660",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_661",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "einsum_default_164",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_647",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_164",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23",
-      "name": "add_116",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_217",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "dtype_cast_213",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_116",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23",
-      "name": "alias_default_662",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_662",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_566",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_566",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_664",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_664",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "pow_48",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mean_47",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "add_117",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "rsqrt_47",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_665",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_664",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_665",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_165",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_213",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_663",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_165",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_663",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_166",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_166",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_567",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_213",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "dtype_cast_214",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_214",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "permute_261",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_567",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_666",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_261",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "alias_default_667",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_666",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_667",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "einsum_default_165",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "alias_default_668",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_668",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "convert_element_type_570",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_570",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_669",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_669",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "neg_23",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "exp_23",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "add_118",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_669",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_118",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "div_23",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "convert_element_type_571",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_215",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "dtype_cast_215",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_215",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "permute_262",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_262",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "alias_default_671",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_666",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_671",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "einsum_default_166",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_571",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_670",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_166",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "alias_default_672",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_670",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_672",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_167",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "dtype_cast_216",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "permute_263",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_167",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_673",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_263",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "alias_default_674",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_673",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_674",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "einsum_default_167",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_662",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_167",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23",
-      "name": "add_119",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_225",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "dtype_cast_217",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_119",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23",
-      "name": "alias_default_675",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_675",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_576",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_576",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_677",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_677",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "pow_49",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mean_48",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "add_120",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_120",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "rsqrt_48",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_678",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_677",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_678",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_168",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_217",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_676",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_168",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_676",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_169",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_169",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_577",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_218",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "dtype_cast_218",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_218",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "permute_264",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_577",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_679",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_264",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "alias_default_680",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_679",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_680",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "einsum_default_168",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_219",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "dtype_cast_219",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_219",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "permute_265",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_265",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "alias_default_681",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_679",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_681",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "einsum_default_169",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_220",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "dtype_cast_220",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_220",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "permute_266",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_266",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "alias_default_682",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_679",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_682",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "einsum_default_170",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_606",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_169",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_607",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_170",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_608",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_606",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_584",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_584",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_609",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_609",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_complex_48",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_607",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_585",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_585",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_610",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_610",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_complex_49",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_611",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_611",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_683",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_683",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "mul_170",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_170",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_real_48",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_612",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_683",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "mul_171",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_171",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_real_49",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_613",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_612",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_586",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_613",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_587",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_587",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "unsqueeze_48",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "expand_48",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "clone_48",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_614",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_608",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "unsqueeze_49",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "expand_49",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "clone_49",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_615",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_586",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_267",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_614",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_268",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_615",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_269",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_267",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_684",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_685",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_686",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_684",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_685",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_686",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_24",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_216",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_217",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_24",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_222",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_24",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_223",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "alias_default_687",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_687",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_270",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_270",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_616",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_221",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "dtype_cast_221",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_221",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "permute_271",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_616",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_688",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_271",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "alias_default_689",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_688",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_689",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "einsum_default_171",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_675",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24",
-      "name": "add_121",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_226",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "dtype_cast_222",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_121",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24",
-      "name": "alias_default_690",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_690",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_590",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_590",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_692",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_692",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "pow_50",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mean_49",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "add_122",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_122",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "rsqrt_49",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_693",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_692",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_693",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_172",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_222",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_691",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_691",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_173",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_591",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_222",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "dtype_cast_223",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_223",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "permute_272",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_591",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_694",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_272",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "alias_default_695",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_694",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_695",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "einsum_default_172",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_172",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "alias_default_696",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_696",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "convert_element_type_594",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_594",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_697",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_697",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "neg_24",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "exp_24",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "add_123",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_697",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_123",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "div_24",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "convert_element_type_595",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_224",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "dtype_cast_224",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_224",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "permute_273",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_273",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "alias_default_699",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_694",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_699",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "einsum_default_173",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_595",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_698",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_173",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "alias_default_700",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_698",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_700",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_174",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_223",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "dtype_cast_225",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_225",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "permute_274",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_174",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_701",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_274",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "alias_default_702",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_701",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_702",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "einsum_default_174",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_690",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_174",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24",
-      "name": "add_124",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_234",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "dtype_cast_226",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24",
-      "name": "alias_default_703",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_703",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_600",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_600",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_705",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_705",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "pow_51",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mean_50",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "add_125",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "rsqrt_50",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_706",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_705",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_706",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_175",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_226",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_704",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_175",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_704",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_176",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_176",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_601",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_227",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "dtype_cast_227",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_227",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "permute_275",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_601",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_707",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_275",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "alias_default_708",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_707",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_708",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "einsum_default_175",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_228",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "dtype_cast_228",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_228",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "permute_276",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_276",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "alias_default_709",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_707",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_709",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "einsum_default_176",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_229",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "dtype_cast_229",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_229",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "permute_277",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_277",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "alias_default_710",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_707",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_710",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "einsum_default_177",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_175",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_631",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_632",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_177",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_633",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_631",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_608",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_608",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_634",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_634",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_complex_50",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_632",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_609",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_609",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_635",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_635",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_complex_51",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_636",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_636",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_711",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_711",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "mul_177",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_177",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_real_50",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_637",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_711",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "mul_178",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_178",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_real_51",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_638",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_637",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_610",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_638",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_611",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_611",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "unsqueeze_50",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "expand_50",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "clone_50",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_639",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_633",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "unsqueeze_51",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "expand_51",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "clone_51",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_640",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_610",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_278",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_639",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_279",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_640",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_280",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_278",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_712",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_279",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_713",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_280",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_714",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_712",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_713",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_714",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_25",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_225",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_226",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_25",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_231",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_25",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_232",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_225",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "alias_default_715",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_715",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_281",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_641",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_230",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "dtype_cast_230",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_230",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "permute_282",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_641",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_716",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_282",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "alias_default_717",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_716",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_717",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "einsum_default_178",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_703",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_178",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25",
-      "name": "add_126",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_235",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "dtype_cast_231",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25",
-      "name": "alias_default_718",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_718",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_614",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_614",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_720",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_720",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "pow_52",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mean_51",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "add_127",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "rsqrt_51",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_721",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_720",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_721",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_179",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_231",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_719",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_179",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_719",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_180",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_615",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_231",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "dtype_cast_232",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_232",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "permute_283",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_615",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_722",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_283",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "alias_default_723",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_722",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_723",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "einsum_default_179",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_179",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "alias_default_724",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_724",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "convert_element_type_618",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_618",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_725",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_725",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "neg_25",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "exp_25",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "add_128",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_725",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "div_25",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "convert_element_type_619",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_233",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "dtype_cast_233",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_233",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "permute_284",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_284",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "alias_default_727",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_722",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_727",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "einsum_default_180",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_619",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_726",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_180",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "alias_default_728",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_726",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_728",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_181",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_232",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "dtype_cast_234",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "permute_285",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_181",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_729",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_285",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "alias_default_730",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_729",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_730",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "einsum_default_181",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_718",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_181",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25",
-      "name": "add_129",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_243",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "dtype_cast_235",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_129",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25",
-      "name": "alias_default_731",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_731",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_624",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_624",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_733",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_733",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "pow_53",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_53",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mean_52",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "add_130",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "rsqrt_52",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_734",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_733",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_734",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_182",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_235",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_732",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_182",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_732",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_183",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_183",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_625",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_236",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "dtype_cast_236",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_236",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "permute_286",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_625",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_735",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_286",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "alias_default_736",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_735",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_736",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "einsum_default_182",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_237",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "dtype_cast_237",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_237",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "permute_287",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_287",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "alias_default_737",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_735",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_737",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "einsum_default_183",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_238",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "dtype_cast_238",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_238",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "permute_288",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_288",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "alias_default_738",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_735",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_738",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "einsum_default_184",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_182",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_656",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_183",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_657",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_184",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_658",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_656",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_632",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_632",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_659",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_659",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_complex_52",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_657",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_633",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_633",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_660",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_660",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_complex_53",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_661",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_661",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_739",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_739",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "mul_184",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_184",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_real_52",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_662",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_739",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "mul_185",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_185",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_real_53",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_663",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_662",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_634",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_663",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_635",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_635",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "unsqueeze_52",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "expand_52",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "clone_52",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_664",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_658",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "unsqueeze_53",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "expand_53",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "clone_53",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_665",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_634",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_289",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_664",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_290",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_665",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_291",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_289",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_740",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_741",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_742",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_740",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_741",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_742",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_26",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_234",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_235",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_26",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_240",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_26",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_241",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "alias_default_743",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_743",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_292",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_292",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_666",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_239",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "dtype_cast_239",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_239",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "permute_293",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_666",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_744",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_293",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "alias_default_745",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_744",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_745",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "einsum_default_185",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_731",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_185",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26",
-      "name": "add_131",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_244",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "dtype_cast_240",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_131",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26",
-      "name": "alias_default_746",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_746",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_638",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_638",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_748",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_748",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "pow_54",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mean_53",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_53",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "add_132",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_132",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "rsqrt_53",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_53",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_749",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_748",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_749",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_186",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_240",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_747",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_747",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_187",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_187",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_639",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_240",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "dtype_cast_241",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_241",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "permute_294",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_639",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_750",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_294",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "alias_default_751",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_750",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_751",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "einsum_default_186",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_186",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "alias_default_752",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_752",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "convert_element_type_642",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_642",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_753",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_753",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "neg_26",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "exp_26",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "add_133",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_753",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_133",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "div_26",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "convert_element_type_643",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_242",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "dtype_cast_242",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_242",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "permute_295",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_295",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "alias_default_755",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_750",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_755",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "einsum_default_187",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_643",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_754",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_187",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "alias_default_756",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_754",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_756",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_188",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "dtype_cast_243",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "permute_296",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_188",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_757",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_296",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "alias_default_758",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_757",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_758",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "einsum_default_188",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_746",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_188",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26",
-      "name": "add_134",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_252",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "dtype_cast_244",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_134",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26",
-      "name": "alias_default_759",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_759",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_648",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_648",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_761",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_761",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "pow_55",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mean_54",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "add_135",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_135",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "rsqrt_54",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_762",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_761",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_762",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_189",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_244",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_760",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_760",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_190",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_190",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_649",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_245",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "dtype_cast_245",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_245",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "permute_297",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_649",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_763",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_297",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "alias_default_764",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_763",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_764",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "einsum_default_189",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_246",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "dtype_cast_246",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_246",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "permute_298",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_298",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "alias_default_765",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_763",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_765",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "einsum_default_190",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_247",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "dtype_cast_247",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_247",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "permute_299",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_299",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "alias_default_766",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_763",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_766",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "einsum_default_191",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_189",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_681",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_190",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_682",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_191",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_683",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_681",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_656",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_656",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_684",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_684",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_complex_54",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_682",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_657",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_657",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_685",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_685",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_complex_55",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_686",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_686",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_767",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_767",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "mul_191",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_191",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_real_54",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_687",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_767",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "mul_192",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_192",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_real_55",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_688",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_687",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_658",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_688",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_659",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_659",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "unsqueeze_54",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "expand_54",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "clone_54",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_689",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_683",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "unsqueeze_55",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "expand_55",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "clone_55",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_690",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_658",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_300",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_689",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_301",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_690",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_302",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_768",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_769",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_302",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_770",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_768",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_769",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_770",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_27",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_243",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_244",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_27",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_249",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_27",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_250",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "alias_default_771",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_771",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_303",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_303",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_691",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_248",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "dtype_cast_248",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_248",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "permute_304",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_691",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_772",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_304",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "alias_default_773",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_772",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_773",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "einsum_default_192",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_759",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_192",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27",
-      "name": "add_136",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_253",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "dtype_cast_249",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_136",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27",
-      "name": "alias_default_774",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_774",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_662",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_662",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_776",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_776",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "pow_56",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mean_55",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "add_137",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_137",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "rsqrt_55",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_777",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_776",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_777",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_193",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_249",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_775",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_193",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_775",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_194",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_194",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_663",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_249",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "dtype_cast_250",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_250",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "permute_305",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_663",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_778",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_305",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "alias_default_779",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_778",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_779",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "einsum_default_193",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_193",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "alias_default_780",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_780",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "convert_element_type_666",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_666",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_781",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_781",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "neg_27",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "exp_27",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "add_138",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_781",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "div_27",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "convert_element_type_667",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_251",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "dtype_cast_251",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_251",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "permute_306",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_306",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "alias_default_783",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_778",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_783",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "einsum_default_194",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_667",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_782",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_194",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "alias_default_784",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_782",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_784",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_195",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_250",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "dtype_cast_252",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_252",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "permute_307",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_195",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_785",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_307",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "alias_default_786",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_785",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_786",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "einsum_default_195",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_774",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_195",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27",
-      "name": "add_139",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_261",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "dtype_cast_253",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_139",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27",
-      "name": "alias_default_787",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_787",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_672",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_672",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_789",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_789",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "pow_57",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mean_56",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "add_140",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_140",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "rsqrt_56",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_790",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_789",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_790",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_196",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_253",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_788",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_196",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_788",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_197",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_197",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_673",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_254",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "dtype_cast_254",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_254",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "permute_308",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_673",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_791",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_308",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "alias_default_792",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_791",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_792",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "einsum_default_196",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_255",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "dtype_cast_255",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_255",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "permute_309",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_309",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "alias_default_793",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_791",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_793",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "einsum_default_197",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_256",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "dtype_cast_256",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_256",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "permute_310",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_310",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "alias_default_794",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_791",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_794",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "einsum_default_198",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_196",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_706",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_707",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_198",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_708",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_706",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_680",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_680",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_709",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_709",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_complex_56",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_707",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_681",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_681",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_710",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_710",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_complex_57",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_711",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_711",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_795",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_795",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "mul_198",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_198",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_real_56",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_712",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_795",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "mul_199",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_199",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_real_57",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_713",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_712",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_682",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_713",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_683",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_683",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "unsqueeze_56",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "expand_56",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "clone_56",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_714",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_708",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "unsqueeze_57",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "expand_57",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "clone_57",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_715",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_682",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_311",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_714",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_312",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_715",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_313",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_796",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_312",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_797",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_313",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_798",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_796",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_797",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_798",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_28",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_252",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_253",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_28",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_258",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_28",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_259",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_252",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "alias_default_799",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_799",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_314",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_314",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_716",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_257",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "dtype_cast_257",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_257",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "permute_315",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_716",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_800",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_315",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "alias_default_801",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_800",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_801",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "einsum_default_199",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_787",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_199",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28",
-      "name": "add_141",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_262",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "dtype_cast_258",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28",
-      "name": "alias_default_802",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_802",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_686",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_686",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_804",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_804",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "pow_58",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_58",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mean_57",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "add_142",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_142",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "rsqrt_57",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_805",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_804",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_805",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_200",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_258",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_803",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_200",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_803",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_201",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_201",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_687",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_258",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "dtype_cast_259",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_259",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "permute_316",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_687",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_806",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_316",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "alias_default_807",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_806",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_807",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "einsum_default_200",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_200",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "alias_default_808",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_808",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "convert_element_type_690",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_690",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_809",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_809",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "neg_28",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "exp_28",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "add_143",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_809",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_143",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "div_28",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "convert_element_type_691",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_260",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "dtype_cast_260",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_260",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "permute_317",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_317",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "alias_default_811",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_806",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_811",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "einsum_default_201",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_691",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_810",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_201",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "alias_default_812",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_810",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_812",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_202",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_259",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "dtype_cast_261",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_261",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "permute_318",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_202",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_813",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_318",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "alias_default_814",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_813",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_814",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "einsum_default_202",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_802",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_202",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28",
-      "name": "add_144",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_270",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "dtype_cast_262",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_144",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28",
-      "name": "alias_default_815",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_815",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_696",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_696",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_817",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_817",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "pow_59",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mean_58",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_58",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "add_145",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "rsqrt_58",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_58",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_818",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_817",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_818",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_203",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_262",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_816",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_203",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_816",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_204",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_204",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_697",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_263",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "dtype_cast_263",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_263",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "permute_319",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_697",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_819",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_319",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "alias_default_820",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_819",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_820",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "einsum_default_203",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_264",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "dtype_cast_264",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_264",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "permute_320",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_320",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "alias_default_821",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_819",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_821",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "einsum_default_204",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_265",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "dtype_cast_265",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_265",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "permute_321",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_321",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "alias_default_822",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_819",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_822",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "einsum_default_205",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_203",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_731",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_204",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_732",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_205",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_733",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_731",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_704",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_704",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_734",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_734",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_complex_58",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_732",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_705",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_705",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_735",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_735",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_complex_59",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_736",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_736",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_823",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_823",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "mul_205",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_205",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_real_58",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_737",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_823",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "mul_206",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_206",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_real_59",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_738",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_737",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_706",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_738",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_707",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_707",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "unsqueeze_58",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "expand_58",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "clone_58",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_739",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_733",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "unsqueeze_59",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "expand_59",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "clone_59",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_740",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_706",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_322",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_739",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_323",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_740",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_324",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_824",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_825",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_324",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_826",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_824",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_825",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_826",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_29",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_261",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_262",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_29",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_267",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_29",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_268",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_261",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "alias_default_827",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_827",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_325",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_325",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_741",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_266",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "dtype_cast_266",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_266",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "permute_326",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_741",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_828",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_326",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "alias_default_829",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_828",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_829",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "einsum_default_206",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_815",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_206",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29",
-      "name": "add_146",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_271",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "dtype_cast_267",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29",
-      "name": "alias_default_830",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_830",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_710",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_710",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_832",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_832",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "pow_60",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mean_59",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "add_147",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "rsqrt_59",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_833",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_832",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_833",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_207",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_267",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_831",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_207",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_831",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_208",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_208",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_711",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_267",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "dtype_cast_268",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_268",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "permute_327",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_711",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_834",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_327",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "alias_default_835",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_834",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_835",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "einsum_default_207",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_207",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "alias_default_836",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_836",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "convert_element_type_714",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_714",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_837",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_837",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "neg_29",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "exp_29",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "add_148",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_837",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_148",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "div_29",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "convert_element_type_715",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_269",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "dtype_cast_269",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_269",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "permute_328",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_328",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "alias_default_839",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_834",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_839",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "einsum_default_208",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_715",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_838",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_208",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "alias_default_840",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_838",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_840",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_209",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "dtype_cast_270",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "permute_329",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_209",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_841",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_329",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "alias_default_842",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_841",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_842",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "einsum_default_209",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_830",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_209",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29",
-      "name": "add_149",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_279",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "dtype_cast_271",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_149",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29",
-      "name": "alias_default_843",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_843",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_720",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_720",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_845",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_845",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "pow_61",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mean_60",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "add_150",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_150",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "rsqrt_60",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_846",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_845",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_846",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_210",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_271",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_844",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_210",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_844",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_211",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_211",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_721",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_272",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "dtype_cast_272",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_272",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "permute_330",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_721",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_847",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_330",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "alias_default_848",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_847",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_848",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "einsum_default_210",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_273",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "dtype_cast_273",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_273",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "permute_331",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_331",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "alias_default_849",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_847",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_849",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "einsum_default_211",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_274",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "dtype_cast_274",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_274",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "permute_332",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_332",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "alias_default_850",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_847",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_850",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "einsum_default_212",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_210",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_756",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_211",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_757",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_212",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_758",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_756",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_728",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_728",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_759",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_759",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_complex_60",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_757",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_729",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_729",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_760",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_760",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_complex_61",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_761",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_761",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_851",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_851",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "mul_212",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_212",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_real_60",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_762",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_851",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "mul_213",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_213",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_real_61",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_763",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_762",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_730",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_763",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_731",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_731",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "unsqueeze_60",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "expand_60",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "clone_60",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_764",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_758",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "unsqueeze_61",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "expand_61",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "clone_61",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_765",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_730",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_333",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_764",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_334",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_765",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_335",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_852",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_334",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_853",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_335",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_854",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_852",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_853",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_854",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_30",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_270",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_271",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_30",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_276",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_30",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_277",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "alias_default_855",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_855",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_336",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_766",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_275",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "dtype_cast_275",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_275",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "permute_337",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_766",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_856",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_337",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "alias_default_857",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_856",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_857",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "einsum_default_213",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_843",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_213",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30",
-      "name": "add_151",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_280",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "dtype_cast_276",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_151",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30",
-      "name": "alias_default_858",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_858",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_734",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_734",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_860",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_860",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "pow_62",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mean_61",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "add_152",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_152",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "rsqrt_61",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_861",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_860",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_861",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_214",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_276",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_859",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_859",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_215",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_215",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_735",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_276",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "dtype_cast_277",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_277",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "permute_338",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_735",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_862",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_338",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "alias_default_863",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_862",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_863",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "einsum_default_214",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_214",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "alias_default_864",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_864",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "convert_element_type_738",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_738",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_865",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_865",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "neg_30",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "exp_30",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "add_153",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_865",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_153",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "div_30",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "convert_element_type_739",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_278",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "dtype_cast_278",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_278",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "permute_339",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_339",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "alias_default_867",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_862",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_867",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "einsum_default_215",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_739",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_866",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_215",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "alias_default_868",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_866",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_868",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_216",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_277",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "dtype_cast_279",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_279",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "permute_340",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_216",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_869",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_340",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "alias_default_870",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_869",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_870",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "einsum_default_216",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_858",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_216",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30",
-      "name": "add_154",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 0,
-      "cluster_root": "dtype_cast_1",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_288",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "dtype_cast_280",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30",
-      "name": "alias_default_871",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 1,
-      "cluster_root": "convert_element_type",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_871",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_744",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 2,
-      "cluster_root": "alias_default_5",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_744",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_873",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 3,
-      "cluster_root": "pow_1",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_873",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "pow_63",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 4,
-      "cluster_root": "mean",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mean_62",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 5,
-      "cluster_root": "add",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "add_155",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 6,
-      "cluster_root": "rsqrt",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_155",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "rsqrt_62",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 7,
-      "cluster_root": "alias_default_6",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_874",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 8,
-      "cluster_root": "mul",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_873",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_874",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_217",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 9,
-      "cluster_root": "alias_default_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_280",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_872",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 10,
-      "cluster_root": "mul_1",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_872",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_218",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 11,
-      "cluster_root": "convert_element_type_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_218",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_745",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 12,
-      "cluster_root": "dtype_cast_2",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_281",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "dtype_cast_281",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 13,
-      "cluster_root": "permute",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 93.01059422750424,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_281",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "permute_341",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 14,
-      "cluster_root": "alias_default_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_745",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_875",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 15,
-      "cluster_root": "alias_default_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_341",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "alias_default_876",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 16,
-      "cluster_root": "einsum_default",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_875",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_876",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "einsum_default_217",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 17,
-      "cluster_root": "dtype_cast_3",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_282",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "dtype_cast_282",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 18,
-      "cluster_root": "permute_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 75.93123841862722,
-          "dst_placement": "RR",
-          "name": "dtype_cast_282",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "permute_342",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 19,
-      "cluster_root": "alias_default_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_342",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "alias_default_877",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 20,
-      "cluster_root": "einsum_default_1",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_875",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_877",
-          "src_placement": "RR",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "einsum_default_218",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 21,
-      "cluster_root": "dtype_cast_4",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_283",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "dtype_cast_283",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 22,
-      "cluster_root": "permute_2",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 39.60264855687606,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_283",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "permute_343",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 23,
-      "cluster_root": "alias_default_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_343",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "alias_default_878",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 24,
-      "cluster_root": "einsum_default_2",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_875",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_878",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "einsum_default_219",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 25,
-      "cluster_root": "view_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_217",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_781",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 26,
-      "cluster_root": "view_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_218",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_782",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 27,
-      "cluster_root": "view_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_219",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_783",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 28,
-      "cluster_root": "convert_element_type_8",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_781",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_752",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 29,
-      "cluster_root": "view_9",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_752",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_784",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 30,
-      "cluster_root": "view_as_complex",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_784",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_complex_62",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 31,
-      "cluster_root": "convert_element_type_9",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_782",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_753",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 32,
-      "cluster_root": "view_10",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_753",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_785",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 33,
-      "cluster_root": "view_as_complex_1",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_785",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_complex_63",
-      "op": "aten.view_as_complex.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 34,
-      "cluster_root": "view_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_786",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 35,
-      "cluster_root": "alias_default_11",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "view_786",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_879",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "return freqs_cis.view(*shape)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "reshape_for_broadcast",
-        "line": 183
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 36,
-      "cluster_root": "mul_2",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_879",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "mul_219",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 37,
-      "cluster_root": "view_as_real",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_219",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_real_62",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 38,
-      "cluster_root": "view_12",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_787",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 39,
-      "cluster_root": "mul_3",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_879",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "mul_220",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 40,
-      "cluster_root": "view_as_real_1",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_220",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_real_63",
-      "op": "aten.view_as_real.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 41,
-      "cluster_root": "view_13",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_788",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 42,
-      "cluster_root": "convert_element_type_10",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_787",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_754",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 43,
-      "cluster_root": "convert_element_type_11",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_788",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_755",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 44,
-      "cluster_root": "unsqueeze",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_755",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "unsqueeze_62",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 45,
-      "cluster_root": "expand",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "expand_62",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 46,
-      "cluster_root": "clone",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "clone_62",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 47,
-      "cluster_root": "view_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_789",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 48,
-      "cluster_root": "unsqueeze_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_783",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "unsqueeze_63",
-      "op": "aten.unsqueeze.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 49,
-      "cluster_root": "expand_1",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "unsqueeze_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "expand_63",
-      "op": "aten.expand.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 50,
-      "cluster_root": "clone_1",
-      "compute_cost": 26.027785181236673,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "expand_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "clone_63",
-      "op": "aten.clone.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 51,
-      "cluster_root": "view_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "clone_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_790",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 52,
-      "cluster_root": "permute_3",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_754",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_344",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 53,
-      "cluster_root": "permute_4",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_789",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_345",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 54,
-      "cluster_root": "permute_5",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_790",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_346",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 55,
-      "cluster_root": "alias_default_12",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_344",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_880",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 56,
-      "cluster_root": "alias_default_13",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_345",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_881",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 57,
-      "cluster_root": "alias_default_14",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_882",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 58,
-      "cluster_root": "_scaled_dot_product_flash_attention",
-      "compute_cost": 794.1005545110502,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_880",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_881",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_882",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_31",
-      "op": "aten._scaled_dot_product_flash_attention.default",
-      "phase": "forward",
-      "placement": "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 59,
-      "cluster_root": "getitem",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_279",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_280",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_31",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_285",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        2
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "uint64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_scaled_dot_product_flash_attention_31",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_286",
-      "op": "<built-in function getitem>",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 60,
-      "cluster_root": "alias_default_15",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_279",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "alias_default_883",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 61,
-      "cluster_root": "permute_6",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_883",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_347",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 62,
-      "cluster_root": "view_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_347",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_791",
-      "op": "aten.view.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 63,
-      "cluster_root": "dtype_cast_5",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_284",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "dtype_cast_284",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 64,
-      "cluster_root": "permute_7",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 187.32495367450883,
-          "dst_placement": "RR",
-          "name": "dtype_cast_284",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "permute_348",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 65,
-      "cluster_root": "alias_default_16",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "view_791",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_884",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 66,
-      "cluster_root": "alias_default_17",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_348",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "alias_default_885",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 67,
-      "cluster_root": "einsum_default_3",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_884",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_885",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "einsum_default_220",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 68,
-      "cluster_root": "add_1",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_871",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_220",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31",
-      "name": "add_156",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 69,
-      "cluster_root": "dtype_cast_6",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_289",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "dtype_cast_285",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 70,
-      "cluster_root": "alias_default_18",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31",
-      "name": "alias_default_886",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "h = x + self.attention(self.attention_norm(x), freqs_cis)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 419
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 71,
-      "cluster_root": "convert_element_type_14",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_886",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_758",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 72,
-      "cluster_root": "alias_default_20",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_758",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_888",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 73,
-      "cluster_root": "pow_2",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_888",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "pow_64",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 74,
-      "cluster_root": "mean_1",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mean_63",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 75,
-      "cluster_root": "add_2",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "add_157",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 76,
-      "cluster_root": "rsqrt_1",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "rsqrt_63",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 77,
-      "cluster_root": "alias_default_21",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_889",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 78,
-      "cluster_root": "mul_4",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_888",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_889",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_221",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 79,
-      "cluster_root": "alias_default_19",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_285",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_887",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 80,
-      "cluster_root": "mul_5",
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_221",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_887",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_222",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 81,
-      "cluster_root": "convert_element_type_15",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_222",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_759",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 82,
-      "cluster_root": "dtype_cast_7",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_285",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "dtype_cast_286",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 83,
-      "cluster_root": "permute_8",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_286",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "permute_349",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 84,
-      "cluster_root": "alias_default_22",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_759",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_890",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 85,
-      "cluster_root": "alias_default_23",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_349",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "alias_default_891",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 86,
-      "cluster_root": "einsum_default_4",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_890",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_891",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "einsum_default_221",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 87,
-      "cluster_root": "alias_default_24",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_221",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "alias_default_892",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 88,
-      "cluster_root": "convert_element_type_18",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_892",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "convert_element_type_762",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 89,
-      "cluster_root": "alias_default_25",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_762",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_893",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 90,
-      "cluster_root": "neg",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_893",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "neg_31",
-      "op": "aten.neg.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 91,
-      "cluster_root": "exp",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "exp_31",
-      "op": "aten.exp.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 92,
-      "cluster_root": "add_3",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "add_158",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 93,
-      "cluster_root": "div",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_893",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_158",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "div_31",
-      "op": "aten.div.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 94,
-      "cluster_root": "convert_element_type_19",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "div_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "convert_element_type_763",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 95,
-      "cluster_root": "dtype_cast_8",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_287",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "dtype_cast_287",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 96,
-      "cluster_root": "permute_9",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_287",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "permute_350",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 97,
-      "cluster_root": "alias_default_27",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_350",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "alias_default_895",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 98,
-      "cluster_root": "einsum_default_5",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_890",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_895",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "einsum_default_222",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 99,
-      "cluster_root": "alias_default_26",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_763",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_894",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 100,
-      "cluster_root": "alias_default_28",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_222",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "alias_default_896",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 101,
-      "cluster_root": "mul_6",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_894",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_896",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_223",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 102,
-      "cluster_root": "dtype_cast_9",
-      "compute_cost": 8.540367012593283,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "primals_286",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "dtype_cast_288",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 103,
-      "cluster_root": "permute_10",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 258.576,
-          "dst_placement": "RS(1)",
-          "name": "dtype_cast_288",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "permute_351",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 104,
-      "cluster_root": "alias_default_29",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_223",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_897",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 105,
-      "cluster_root": "alias_default_30",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_351",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "alias_default_898",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 106,
-      "cluster_root": "einsum_default_6",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_897",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_898",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "einsum_default_223",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 107,
-      "cluster_root": "add_4",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_886",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_223",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31",
-      "name": "add_159",
-      "op": "aten.add.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_290",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "dtype_cast_289",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 108,
-      "cluster_root": "alias_default_31",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_159",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31",
-      "name": "alias_default_899",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "out = h + self.feed_forward(self.ffn_norm(h))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 420
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_899",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_768",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_768",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_901",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_901",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "pow_65",
-      "op": "aten.pow.Tensor_Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "pow_65",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mean_64",
-      "op": "aten.mean.dim",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mean_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "add_160",
-      "op": "aten.add.Scalar",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_160",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "rsqrt_64",
-      "op": "aten.rsqrt.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "rsqrt_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_902",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_901",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_902",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_224",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 28.358260191421483,
-          "dst_placement": "RR",
-          "name": "dtype_cast_289",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_900",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 52.058747582344104,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_224",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_900",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_225",
-      "op": "aten.mul.Tensor",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_225",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_769",
-      "op": "prims.convert_element_type.default",
-      "phase": "forward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 76.40578345195063,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(0)",
-          "name": "primals_291",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "dtype_cast_290",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "forward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 2081.296,
-          "dst_placement": "RS(0)",
-          "name": "dtype_cast_290",
-          "src_placement": "S(0)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "permute_352",
-      "op": "aten.permute.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        128256
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_769",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_903",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_352",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "alias_default_904",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        128256
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 6216.318403281814,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_903",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_904",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "einsum_default_224",
-      "op": "aten.einsum.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        128256
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "alias_default_1420",
-      "op": "aten.alias.default",
-      "phase": "forward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        128256
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "tangents_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "alias_default_2",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        128256
-      ],
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 6216.318403281814,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_903",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "einsum_default_225",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        128256
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_904",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "permute_355",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 6216.318403281814,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_355",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "einsum_default_226",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_225",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "permute_356",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "output = self.output(h) if self.output else h",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 545
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 305.6231338078025,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_356",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "dtype_cast_291",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 4133.392,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_291",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].output",
-      "name": "alias_default_1711",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_226",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_776",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_899",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_777",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_900",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_778",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_776",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_905",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_905",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_778",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_226",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_777",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_902",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_227",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_226",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_906",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_907",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_907",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_906",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_228",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_228",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "sum_1",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_907",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "div_32",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_229",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_906",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_229",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "sub",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_902",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_230",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_905",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_907",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "mul_231",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_231",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "sum_2",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_230",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_779",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_2",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "convert_element_type_780",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_780",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "dtype_cast_292",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_292",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_1710",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "convert_element_type_779",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].norm",
-      "name": "alias_default_908",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_908",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_897",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "einsum_default_227",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_898",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "permute_359",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_908",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_359",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "einsum_default_228",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_227",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "permute_360",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_360",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "dtype_cast_293",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_293",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "alias_default_1706",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_228",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w2",
-      "name": "alias_default_909",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_909",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_894",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_232",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_909",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_896",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_233",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_232",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_910",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_910",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_890",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "einsum_default_229",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_895",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "permute_363",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_910",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_363",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "einsum_default_230",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_229",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "permute_364",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_364",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "dtype_cast_294",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_294",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w3",
-      "name": "alias_default_1707",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_233",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "convert_element_type_789",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_892",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "convert_element_type_790",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_790",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_911",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_911",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "neg_32",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "exp_32",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "add_161",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_161",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "reciprocal",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_234",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_234",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_912",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_789",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_912",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_235",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_912",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "sub_1",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_911",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_236",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_236",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "add_162",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_235",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_162",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "mul_237",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_237",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "convert_element_type_791",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_791",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward",
-      "name": "alias_default_913",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_913",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_890",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "einsum_default_231",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_891",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "permute_367",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_913",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_367",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "einsum_default_232",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_230",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_232",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31",
-      "name": "add_163",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_231",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "permute_368",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_368",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "dtype_cast_295",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_295",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.feed_forward.w1",
-      "name": "alias_default_1705",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_163",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_796",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_886",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_797",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_887",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_798",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_796",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_914",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_914",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_798",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_238",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_797",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_889",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_239",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_238",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_915",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_239",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_916",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_916",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_915",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_240",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_240",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "sum_3",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_916",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "div_33",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_241",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_915",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "sub_2",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_889",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_242",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_914",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_916",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "mul_243",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "sum_4",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_799",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_4",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "convert_element_type_800",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_908",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_799",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "name": "add_164",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_800",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "dtype_cast_296",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_296",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.ffn_norm",
-      "name": "alias_default_1709",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_164",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "alias_default_917",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_917",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_884",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "einsum_default_233",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_885",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "permute_371",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_917",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_371",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "einsum_default_234",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_233",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "permute_372",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_372",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "dtype_cast_297",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_297",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wo",
-      "name": "alias_default_1704",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_812",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_812",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_373",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_373",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_880",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_881",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_882",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_883",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_280",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_285",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_286",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_288",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_289",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.sdpa",
-      "name": "getitem_290",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_374",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_289",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_375",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_288",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "permute_376",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_374",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_813",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_813",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "sum_5",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "squeeze",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_375",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_814",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_814",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "sum_6",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "squeeze_1",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_805",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_376",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_806",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_805",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_815",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_815",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_complex_64",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_879",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "_conj",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "clone_70",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_64",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_70",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "mul_244",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_806",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_816",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_816",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_complex_65",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_879",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "_conj_1",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_1",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "clone_71",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_65",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_71",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "mul_245",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_244",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_real_64",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_64",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_817",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_817",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_807",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_245",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_as_real_65",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_65",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_818",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_818",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "convert_element_type_808",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_819",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_807",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_820",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_808",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "view_821",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_819",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_918",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_918",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_875",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "einsum_default_235",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_878",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "permute_379",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_918",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_379",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "einsum_default_236",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_235",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "permute_380",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_380",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "dtype_cast_298",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_298",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wv",
-      "name": "alias_default_1703",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_820",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_919",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_919",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_875",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "einsum_default_237",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_877",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "permute_383",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_919",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_383",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "einsum_default_238",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_238",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "add_165",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_237",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "permute_384",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_384",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "dtype_cast_299",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_299",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wk",
-      "name": "alias_default_1702",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_821",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention",
-      "name": "alias_default_920",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_920",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_875",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "einsum_default_239",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_876",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "permute_387",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_920",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_387",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "einsum_default_240",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_165",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_240",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31",
-      "name": "add_166",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_239",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "permute_388",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_388",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "dtype_cast_300",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_300",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention.wq",
-      "name": "alias_default_1701",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_166",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_821",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_871",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_822",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_872",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_823",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_821",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_921",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_921",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_823",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_246",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_822",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_874",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_247",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_246",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_922",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_247",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_923",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_923",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_922",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_248",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_248",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "sum_7",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_923",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "div_34",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_249",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_922",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_249",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "sub_3",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_874",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_250",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_921",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_923",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "mul_251",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_251",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "sum_8",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_250",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_824",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_8",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "convert_element_type_825",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_917",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_824",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "add_167",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_825",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "dtype_cast_301",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_301",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.31.attention_norm",
-      "name": "alias_default_1708",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_167",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "alias_default_924",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_924",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_869",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "einsum_default_241",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_870",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "permute_391",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_924",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_391",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "einsum_default_242",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_241",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "permute_392",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_392",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "dtype_cast_302",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_302",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "alias_default_1697",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_242",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w2",
-      "name": "alias_default_925",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_925",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_866",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_252",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_925",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_868",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_253",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_252",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_926",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_926",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_862",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "einsum_default_243",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_867",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "permute_395",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_926",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_395",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "einsum_default_244",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_243",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "permute_396",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_396",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "dtype_cast_303",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_303",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w3",
-      "name": "alias_default_1698",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_253",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "convert_element_type_834",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_864",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "convert_element_type_835",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_835",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_927",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_927",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "neg_33",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "exp_33",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "add_168",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "reciprocal_1",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_1",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_254",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_254",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_928",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_834",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_928",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_255",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_928",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "sub_4",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_927",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_256",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_256",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "add_169",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_255",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_169",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "mul_257",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_257",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "convert_element_type_836",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_836",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward",
-      "name": "alias_default_929",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_929",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_862",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "einsum_default_245",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_863",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "permute_399",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_929",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_399",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "einsum_default_246",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_244",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_246",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30",
-      "name": "add_170",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_245",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "permute_400",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_400",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "dtype_cast_304",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_304",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.feed_forward.w1",
-      "name": "alias_default_1696",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_170",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_841",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_858",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_842",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_859",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_843",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_841",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_930",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_930",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_843",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_258",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_842",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_861",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_259",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_931",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_259",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_932",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_932",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_931",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_260",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_260",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "sum_9",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_932",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "div_35",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_261",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_931",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_261",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "sub_5",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_861",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_262",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_930",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_932",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "mul_263",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_263",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "sum_10",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_262",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_844",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_10",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "convert_element_type_845",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_924",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_844",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "add_171",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_845",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "dtype_cast_305",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_305",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.ffn_norm",
-      "name": "alias_default_1700",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "alias_default_933",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_933",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_856",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "einsum_default_247",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_857",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "permute_403",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_933",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_403",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "einsum_default_248",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_247",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "permute_404",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_404",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "dtype_cast_306",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_306",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wo",
-      "name": "alias_default_1695",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_248",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_836",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_836",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_405",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_405",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_852",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_853",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_854",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_855",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_271",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_276",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_277",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_1",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_291",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_292",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.sdpa",
-      "name": "getitem_293",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_293",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_406",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_292",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_407",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "permute_408",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_406",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_837",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_837",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "sum_11",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "squeeze_2",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_407",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_838",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_838",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "sum_12",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "squeeze_3",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_850",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_408",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_851",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_850",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_839",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_839",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_complex_66",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_851",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "_conj_2",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_2",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "clone_78",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_66",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_78",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "mul_264",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_851",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_840",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_840",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_complex_67",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_851",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "_conj_3",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_3",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "clone_79",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_67",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_79",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "mul_265",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_264",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_real_66",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_66",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_841",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_841",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_852",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_265",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_as_real_67",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_67",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_842",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_842",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "convert_element_type_853",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_843",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_852",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_844",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_853",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "view_845",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_843",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_934",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_934",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_847",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "einsum_default_249",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_850",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "permute_411",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_934",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_411",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "einsum_default_250",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_249",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "permute_412",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_412",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "dtype_cast_307",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_307",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wv",
-      "name": "alias_default_1694",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_844",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_935",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_935",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_847",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "einsum_default_251",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_849",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "permute_415",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_935",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_415",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "einsum_default_252",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_250",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_252",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "add_172",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_251",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "permute_416",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_416",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "dtype_cast_308",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_308",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wk",
-      "name": "alias_default_1693",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_845",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention",
-      "name": "alias_default_936",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_936",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_847",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "einsum_default_253",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_848",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "permute_419",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_936",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_419",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "einsum_default_254",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_254",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30",
-      "name": "add_173",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_253",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "permute_420",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_420",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "dtype_cast_309",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_309",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention.wq",
-      "name": "alias_default_1692",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_866",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_843",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_867",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_844",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_868",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_866",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_937",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_937",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_868",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_266",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_867",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_846",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_267",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_938",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_267",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_939",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_939",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_938",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_268",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "sum_13",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_939",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "div_36",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_269",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_938",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "sub_6",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_846",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_270",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_937",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_939",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "mul_271",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_271",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "sum_14",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_869",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_14",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "convert_element_type_870",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_933",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_869",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "add_174",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_870",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "dtype_cast_310",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_310",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.30.attention_norm",
-      "name": "alias_default_1699",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_174",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "alias_default_940",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_940",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_841",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "einsum_default_255",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_842",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "permute_423",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_940",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_423",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "einsum_default_256",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_255",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "permute_424",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_424",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "dtype_cast_311",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_311",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "alias_default_1688",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_256",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w2",
-      "name": "alias_default_941",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_941",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_838",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_272",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_941",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_840",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_273",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_272",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_942",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_942",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_834",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "einsum_default_257",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_839",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "permute_427",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_942",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_427",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "einsum_default_258",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_257",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "permute_428",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_428",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "dtype_cast_312",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_312",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w3",
-      "name": "alias_default_1689",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_273",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "convert_element_type_879",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_836",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "convert_element_type_880",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_880",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_943",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_943",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "neg_34",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "exp_34",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "add_175",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_175",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "reciprocal_2",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_2",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_274",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_274",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_944",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_879",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_944",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_275",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_944",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "sub_7",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_943",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_276",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_276",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "add_176",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_275",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "mul_277",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_277",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "convert_element_type_881",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_881",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward",
-      "name": "alias_default_945",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_945",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_834",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "einsum_default_259",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_835",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "permute_431",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_945",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_431",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "einsum_default_260",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_258",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_260",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29",
-      "name": "add_177",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_259",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "permute_432",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_432",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "dtype_cast_313",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_313",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.feed_forward.w1",
-      "name": "alias_default_1687",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_177",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_886",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_830",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_887",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_831",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_888",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_886",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_946",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_946",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_888",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_278",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_887",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_833",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_279",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_278",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_947",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_279",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_948",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_948",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_947",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_280",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_280",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "sum_15",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_948",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "div_37",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_281",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_947",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_281",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "sub_8",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_833",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_282",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_946",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_948",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "mul_283",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "sum_16",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_282",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_889",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_16",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "convert_element_type_890",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_940",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_889",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "add_178",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_890",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "dtype_cast_314",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_314",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.ffn_norm",
-      "name": "alias_default_1691",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_178",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "alias_default_949",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_949",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_828",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "einsum_default_261",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_829",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "permute_435",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_949",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_435",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "einsum_default_262",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_261",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "permute_436",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_436",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "dtype_cast_315",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_315",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wo",
-      "name": "alias_default_1686",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_262",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_860",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_860",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_437",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_437",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_824",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_825",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_826",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_827",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_262",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_267",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_268",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_2",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_294",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_295",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_2",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.sdpa",
-      "name": "getitem_296",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_296",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_438",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_295",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_439",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_294",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "permute_440",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_438",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_861",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_861",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "sum_17",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "squeeze_4",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_439",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_862",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_862",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "sum_18",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "squeeze_5",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_895",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_440",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_896",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_895",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_863",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_863",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_complex_68",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_823",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "_conj_4",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_4",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "clone_86",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_68",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_86",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "mul_284",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_896",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_864",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_864",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_complex_69",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_823",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "_conj_5",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_5",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "clone_87",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_69",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_87",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "mul_285",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_284",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_real_68",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_68",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_865",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_865",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_897",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_285",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_as_real_69",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_69",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_866",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_866",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "convert_element_type_898",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_867",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_897",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_868",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_898",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "view_869",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_867",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_950",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_950",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_819",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "einsum_default_263",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_822",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "permute_443",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_950",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_443",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "einsum_default_264",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_263",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "permute_444",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_444",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "dtype_cast_316",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_316",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wv",
-      "name": "alias_default_1685",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_868",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_951",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_951",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_819",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "einsum_default_265",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_821",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "permute_447",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_951",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_447",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "einsum_default_266",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_264",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "add_179",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_265",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "permute_448",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_448",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "dtype_cast_317",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_317",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wk",
-      "name": "alias_default_1684",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_869",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention",
-      "name": "alias_default_952",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_952",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_819",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "einsum_default_267",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_820",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "permute_451",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_952",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_451",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "einsum_default_268",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_179",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29",
-      "name": "add_180",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_267",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "permute_452",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_452",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "dtype_cast_318",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_318",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention.wq",
-      "name": "alias_default_1683",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_911",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_815",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_912",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_816",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_913",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_911",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_953",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_953",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_913",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_286",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_912",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_818",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_287",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_286",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_954",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_287",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_955",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_955",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_954",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_288",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_288",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "sum_19",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_955",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "div_38",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_289",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_954",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_289",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "sub_9",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_818",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_290",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_953",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_955",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "mul_291",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "sum_20",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_914",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_20",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "convert_element_type_915",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_949",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_914",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "add_181",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_915",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "dtype_cast_319",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_319",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.29.attention_norm",
-      "name": "alias_default_1690",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "alias_default_956",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_956",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_813",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "einsum_default_269",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_814",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "permute_455",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_956",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_455",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "einsum_default_270",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_269",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "permute_456",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_456",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "dtype_cast_320",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_320",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "alias_default_1679",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_270",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w2",
-      "name": "alias_default_957",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_957",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_810",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_292",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_957",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_812",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_293",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_292",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_958",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_958",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_806",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "einsum_default_271",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_811",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "permute_459",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_958",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_459",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "einsum_default_272",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_271",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "permute_460",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_460",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "dtype_cast_321",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_321",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w3",
-      "name": "alias_default_1680",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_293",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "convert_element_type_924",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_808",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "convert_element_type_925",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_925",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_959",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_959",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "neg_35",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "exp_35",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "add_182",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_182",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "reciprocal_3",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_3",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_294",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_294",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_960",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_924",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_960",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_295",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_960",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "sub_10",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_959",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_296",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_296",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "add_183",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_295",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_183",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "mul_297",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_297",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "convert_element_type_926",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_926",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward",
-      "name": "alias_default_961",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_961",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_806",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "einsum_default_273",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_807",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "permute_463",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_961",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_463",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "einsum_default_274",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_272",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_274",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28",
-      "name": "add_184",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_273",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "permute_464",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_464",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "dtype_cast_322",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_322",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.feed_forward.w1",
-      "name": "alias_default_1678",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_184",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_931",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_802",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_932",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_803",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_933",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_931",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_962",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_962",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_933",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_298",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_932",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_805",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_299",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_963",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_299",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_964",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_964",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_963",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_300",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "sum_21",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_964",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "div_39",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_301",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_963",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "sub_11",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_805",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_302",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_962",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_964",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "mul_303",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_303",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "sum_22",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_302",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_934",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_22",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "convert_element_type_935",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_956",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_934",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "add_185",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_935",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "dtype_cast_323",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_323",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.ffn_norm",
-      "name": "alias_default_1682",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_185",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "alias_default_965",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_965",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_800",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "einsum_default_275",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_801",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "permute_467",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_965",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_467",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "einsum_default_276",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_275",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "permute_468",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_468",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "dtype_cast_324",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_324",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wo",
-      "name": "alias_default_1677",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_276",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_884",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_884",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_469",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_469",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_796",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_797",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_798",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_799",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_253",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_258",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_259",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_3",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_297",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_298",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.sdpa",
-      "name": "getitem_299",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_299",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_470",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_471",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_297",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "permute_472",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_470",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_885",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_885",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "sum_23",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "squeeze_6",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_471",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_886",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_886",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "sum_24",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "squeeze_7",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_940",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_472",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_941",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_940",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_887",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_887",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_complex_70",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_795",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "_conj_6",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_6",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "clone_94",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_70",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_94",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "mul_304",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_941",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_888",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_888",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_complex_71",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_795",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "_conj_7",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_7",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "clone_95",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_71",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_95",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "mul_305",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_304",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_real_70",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_70",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_889",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_889",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_942",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_305",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_as_real_71",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_71",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_890",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_890",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "convert_element_type_943",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_891",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_942",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_892",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_943",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "view_893",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_891",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_966",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_966",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_791",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "einsum_default_277",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_794",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "permute_475",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_966",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_475",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "einsum_default_278",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_277",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "permute_476",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_476",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "dtype_cast_325",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_325",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wv",
-      "name": "alias_default_1676",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_892",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_967",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_967",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_791",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "einsum_default_279",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_793",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "permute_479",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_967",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_479",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "einsum_default_280",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_278",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_280",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "add_186",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_279",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "permute_480",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_480",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "dtype_cast_326",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_326",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wk",
-      "name": "alias_default_1675",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_893",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention",
-      "name": "alias_default_968",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_968",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_791",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "einsum_default_281",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_792",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "permute_483",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_968",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_483",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "einsum_default_282",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_282",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28",
-      "name": "add_187",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_281",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "permute_484",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_484",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "dtype_cast_327",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_327",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention.wq",
-      "name": "alias_default_1674",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_187",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_956",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_787",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_957",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_788",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_958",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_956",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_969",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_969",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_958",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_306",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_957",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_790",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_307",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_306",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_970",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_307",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_971",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_971",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_970",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_308",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_308",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "sum_25",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_971",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "div_40",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_309",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_970",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_309",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "sub_12",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_790",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_310",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_969",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_971",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "mul_311",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "sum_26",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_310",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_959",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_26",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "convert_element_type_960",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_965",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_959",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "add_188",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_960",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "dtype_cast_328",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_328",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.28.attention_norm",
-      "name": "alias_default_1681",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_188",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "alias_default_972",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_972",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_785",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "einsum_default_283",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_786",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "permute_487",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_972",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_487",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "einsum_default_284",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_283",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "permute_488",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_488",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "dtype_cast_329",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_329",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "alias_default_1670",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_284",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w2",
-      "name": "alias_default_973",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_973",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_782",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_312",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_973",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_784",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_313",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_312",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_974",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_974",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_778",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "einsum_default_285",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_783",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "permute_491",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_974",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_491",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "einsum_default_286",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_285",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "permute_492",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_492",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "dtype_cast_330",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_330",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w3",
-      "name": "alias_default_1671",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_313",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "convert_element_type_969",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_780",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "convert_element_type_970",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_970",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_975",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_975",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "neg_36",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "exp_36",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "add_189",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_189",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "reciprocal_4",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_4",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_314",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_314",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_976",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_969",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_976",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_315",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_976",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "sub_13",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_975",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_316",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_316",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "add_190",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_315",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_190",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "mul_317",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_317",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "convert_element_type_971",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_971",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward",
-      "name": "alias_default_977",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_977",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_778",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "einsum_default_287",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_779",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "permute_495",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_977",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_495",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "einsum_default_288",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_286",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_288",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27",
-      "name": "add_191",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_287",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "permute_496",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_496",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "dtype_cast_331",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_331",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.feed_forward.w1",
-      "name": "alias_default_1669",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_191",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_976",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_774",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_977",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_775",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_978",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_976",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_978",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_978",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_978",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_318",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_977",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_777",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_319",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_318",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_979",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_319",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_980",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_980",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_979",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_320",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_320",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "sum_27",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_980",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "div_41",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_321",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_979",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "sub_14",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_777",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_322",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_978",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_980",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "mul_323",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "sum_28",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_979",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_28",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "convert_element_type_980",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_972",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_979",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "add_192",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_980",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "dtype_cast_332",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_332",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.ffn_norm",
-      "name": "alias_default_1673",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_192",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "alias_default_981",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_981",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_772",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "einsum_default_289",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_773",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "permute_499",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_981",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_499",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "einsum_default_290",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_289",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "permute_500",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_500",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "dtype_cast_333",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_333",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wo",
-      "name": "alias_default_1668",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_908",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_908",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_501",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_501",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_768",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_769",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_770",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_771",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_244",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_249",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_250",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_4",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_300",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_301",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_4",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.sdpa",
-      "name": "getitem_302",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_302",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_502",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_503",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "permute_504",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_502",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_909",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_909",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "sum_29",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "squeeze_8",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_503",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_910",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_910",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "sum_30",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "squeeze_9",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_985",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_504",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_986",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_985",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_911",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_911",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_complex_72",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_767",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "_conj_8",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_8",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "clone_102",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_72",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_102",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "mul_324",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_986",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_912",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_912",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_complex_73",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_767",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "_conj_9",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_9",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "clone_103",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_73",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_103",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "mul_325",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_324",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_real_72",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_72",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_913",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_913",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_987",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_325",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_as_real_73",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_73",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_914",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_914",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "convert_element_type_988",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_915",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_987",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_916",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_988",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "view_917",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_915",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_982",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_982",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_763",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "einsum_default_291",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_766",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "permute_507",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_982",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_507",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "einsum_default_292",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_291",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "permute_508",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_508",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "dtype_cast_334",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_334",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wv",
-      "name": "alias_default_1667",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_916",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_983",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_983",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_763",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "einsum_default_293",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_765",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "permute_511",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_983",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_511",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "einsum_default_294",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_292",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_294",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "add_193",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_293",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "permute_512",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_512",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "dtype_cast_335",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_335",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wk",
-      "name": "alias_default_1666",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_917",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention",
-      "name": "alias_default_984",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_984",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_763",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "einsum_default_295",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_764",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "permute_515",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_984",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_515",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "einsum_default_296",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_193",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_296",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27",
-      "name": "add_194",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_295",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "permute_516",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_516",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "dtype_cast_336",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_336",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention.wq",
-      "name": "alias_default_1665",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_194",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_1001",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_759",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_1002",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_760",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_1003",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1001",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_985",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_985",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1003",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_326",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1002",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_762",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_327",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_986",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_327",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_987",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_987",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_986",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_328",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_328",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "sum_31",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_987",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "div_42",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_329",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_986",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_329",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "sub_15",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_762",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_330",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_985",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_987",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "mul_331",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_331",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "sum_32",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_330",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_1004",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_32",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "convert_element_type_1005",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_981",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1004",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "add_195",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1005",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "dtype_cast_337",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_337",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.27.attention_norm",
-      "name": "alias_default_1672",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_195",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "alias_default_988",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_988",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_757",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "einsum_default_297",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_758",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "permute_519",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_988",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_519",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "einsum_default_298",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_297",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "permute_520",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_520",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "dtype_cast_338",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_338",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "alias_default_1661",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_298",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w2",
-      "name": "alias_default_989",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_989",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_754",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_332",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_989",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_756",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_333",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_332",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_990",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_990",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_750",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "einsum_default_299",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_755",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "permute_523",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_990",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_523",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "einsum_default_300",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_299",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "permute_524",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_524",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "dtype_cast_339",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_339",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w3",
-      "name": "alias_default_1662",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_333",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "convert_element_type_1014",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_752",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "convert_element_type_1015",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1015",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_991",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_991",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "neg_37",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "exp_37",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "add_196",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_196",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "reciprocal_5",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_5",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_334",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_992",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1014",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_992",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_335",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_992",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "sub_16",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_991",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_336",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "add_197",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_335",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "mul_337",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_337",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "convert_element_type_1016",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1016",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward",
-      "name": "alias_default_993",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_993",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_750",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "einsum_default_301",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_751",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "permute_527",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_993",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_527",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "einsum_default_302",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_300",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_302",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26",
-      "name": "add_198",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_301",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "permute_528",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_528",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "dtype_cast_340",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_340",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.feed_forward.w1",
-      "name": "alias_default_1660",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_198",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_1021",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_746",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_1022",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_747",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_1023",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1021",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_994",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_994",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1023",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_338",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1022",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_749",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_339",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_338",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_995",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_996",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_996",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_995",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_340",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_340",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "sum_33",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_996",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "div_43",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_341",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_995",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_341",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "sub_17",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_749",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_342",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_994",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_996",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "mul_343",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_343",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "sum_34",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_342",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_1024",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_34",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "convert_element_type_1025",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_988",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1024",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "add_199",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1025",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "dtype_cast_341",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_341",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.ffn_norm",
-      "name": "alias_default_1664",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_199",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "alias_default_997",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_997",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_744",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "einsum_default_303",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_745",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "permute_531",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_997",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_531",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "einsum_default_304",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_303",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "permute_532",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_532",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "dtype_cast_342",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_342",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wo",
-      "name": "alias_default_1659",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_304",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_932",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_932",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_533",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_533",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_740",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_741",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_742",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_743",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_235",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_240",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_241",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_5",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_303",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_304",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_5",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.sdpa",
-      "name": "getitem_305",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_305",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_534",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_304",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_535",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_303",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "permute_536",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_534",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_933",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_933",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "sum_35",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "squeeze_10",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_535",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_934",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_934",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "sum_36",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "squeeze_11",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_1030",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_536",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_1031",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1030",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_935",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_935",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_complex_74",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_739",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "_conj_10",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_10",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "clone_110",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_74",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_110",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "mul_344",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1031",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_936",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_936",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_complex_75",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_739",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "_conj_11",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "clone_111",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_75",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_111",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "mul_345",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_344",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_real_74",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_74",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_937",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_937",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_1032",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_345",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_as_real_75",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_75",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_938",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_938",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "convert_element_type_1033",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_939",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1032",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_940",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1033",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "view_941",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_939",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_998",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_998",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_735",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "einsum_default_305",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_738",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "permute_539",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_998",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_539",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "einsum_default_306",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_305",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "permute_540",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_540",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "dtype_cast_343",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_343",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wv",
-      "name": "alias_default_1658",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_940",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_999",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_999",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_735",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "einsum_default_307",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_737",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "permute_543",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_999",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_543",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "einsum_default_308",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_306",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_308",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "add_200",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_307",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "permute_544",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_544",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "dtype_cast_344",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_344",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wk",
-      "name": "alias_default_1657",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_941",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention",
-      "name": "alias_default_1000",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1000",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_735",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "einsum_default_309",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_736",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "permute_547",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1000",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_547",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "einsum_default_310",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_200",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_310",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26",
-      "name": "add_201",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_309",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "permute_548",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_548",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "dtype_cast_345",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_345",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention.wq",
-      "name": "alias_default_1656",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_201",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_1046",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_731",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_1047",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_732",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_1048",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1046",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_1001",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1001",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1048",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_346",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1047",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_734",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_347",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_1002",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_347",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_1003",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1003",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1002",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_348",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "sum_37",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1003",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "div_44",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_349",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1002",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "sub_18",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_734",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_350",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1001",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1003",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "mul_351",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_351",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "sum_38",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_350",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_1049",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_38",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "convert_element_type_1050",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_997",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1049",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "add_202",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1050",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "dtype_cast_346",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_346",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.26.attention_norm",
-      "name": "alias_default_1663",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "alias_default_1004",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1004",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_729",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "einsum_default_311",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_730",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "permute_551",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1004",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_551",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "einsum_default_312",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_311",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "permute_552",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_552",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "dtype_cast_347",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_347",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "alias_default_1652",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_312",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w2",
-      "name": "alias_default_1005",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1005",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_726",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_352",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1005",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_728",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_353",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_352",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_1006",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1006",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_722",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "einsum_default_313",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_727",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "permute_555",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1006",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_555",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "einsum_default_314",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_313",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "permute_556",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_556",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "dtype_cast_348",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_348",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w3",
-      "name": "alias_default_1653",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_353",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "convert_element_type_1059",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_724",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "convert_element_type_1060",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1060",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_1007",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1007",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "neg_38",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "exp_38",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "add_203",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_203",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "reciprocal_6",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_6",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_354",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_354",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_1008",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1059",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1008",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_355",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1008",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "sub_19",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1007",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_356",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_356",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "add_204",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_355",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_204",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "mul_357",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_357",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "convert_element_type_1061",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1061",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward",
-      "name": "alias_default_1009",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1009",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_722",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "einsum_default_315",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_723",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "permute_559",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1009",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_559",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "einsum_default_316",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_314",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_316",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25",
-      "name": "add_205",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_315",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "permute_560",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_560",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "dtype_cast_349",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_349",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.feed_forward.w1",
-      "name": "alias_default_1651",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_205",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_1066",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_718",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_1067",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_719",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_1068",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1066",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_1010",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1010",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1068",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_358",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1067",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_721",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_359",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_358",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_1011",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_359",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_1012",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1012",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1011",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_360",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_360",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "sum_39",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1012",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "div_45",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_361",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1011",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_361",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "sub_20",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_721",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_362",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1010",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1012",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "mul_363",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_363",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "sum_40",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_1069",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_40",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "convert_element_type_1070",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1004",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1069",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "add_206",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1070",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "dtype_cast_350",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_350",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.ffn_norm",
-      "name": "alias_default_1655",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_206",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "alias_default_1013",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1013",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_716",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "einsum_default_317",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_717",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "permute_563",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1013",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_563",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "einsum_default_318",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_317",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "permute_564",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_564",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "dtype_cast_351",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_351",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wo",
-      "name": "alias_default_1650",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_318",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_956",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_956",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_565",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_565",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_712",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_713",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_714",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_715",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_226",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_231",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_232",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_6",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_306",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_307",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.sdpa",
-      "name": "getitem_308",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_308",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_566",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_307",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_567",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_306",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "permute_568",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_566",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_957",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_957",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "sum_41",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "squeeze_12",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_567",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_958",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_958",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "sum_42",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "squeeze_13",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_1075",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_568",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_1076",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1075",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_959",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_959",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_complex_76",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_711",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "_conj_12",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_12",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "clone_118",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_76",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_118",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "mul_364",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1076",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_960",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_960",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_complex_77",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_711",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "_conj_13",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_13",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "clone_119",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_77",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_119",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "mul_365",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_364",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_real_76",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_76",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_961",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_961",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_1077",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_365",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_as_real_77",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_77",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_962",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_962",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "convert_element_type_1078",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_963",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1077",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_964",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1078",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "view_965",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_963",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_1014",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1014",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_707",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "einsum_default_319",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_710",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "permute_571",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1014",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_571",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "einsum_default_320",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_319",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "permute_572",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_572",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "dtype_cast_352",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_352",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wv",
-      "name": "alias_default_1649",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_964",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_1015",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1015",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_707",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "einsum_default_321",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_709",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "permute_575",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1015",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_575",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "einsum_default_322",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_320",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "add_207",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_321",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "permute_576",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_576",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "dtype_cast_353",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_353",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wk",
-      "name": "alias_default_1648",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_965",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention",
-      "name": "alias_default_1016",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1016",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_707",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "einsum_default_323",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_708",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "permute_579",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1016",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_579",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "einsum_default_324",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_207",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_324",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25",
-      "name": "add_208",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_323",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "permute_580",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_580",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "dtype_cast_354",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_354",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention.wq",
-      "name": "alias_default_1647",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_208",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_1091",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_703",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_1092",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_704",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_1093",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1091",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_1017",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1017",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1093",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_366",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1092",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_706",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_367",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_366",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_1018",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_367",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_1019",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1019",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1018",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_368",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_368",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "sum_43",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1019",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "div_46",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_369",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1018",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "sub_21",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_706",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_370",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1017",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1019",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "mul_371",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_371",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "sum_44",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_1094",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_44",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "convert_element_type_1095",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1013",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1094",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "add_209",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1095",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "dtype_cast_355",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_355",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.25.attention_norm",
-      "name": "alias_default_1654",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_209",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "alias_default_1020",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1020",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_701",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "einsum_default_325",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_702",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "permute_583",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1020",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_583",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "einsum_default_326",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_325",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "permute_584",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_584",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "dtype_cast_356",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_356",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "alias_default_1643",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_326",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w2",
-      "name": "alias_default_1021",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1021",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_698",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_372",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1021",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_700",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_373",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_372",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_1022",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1022",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_694",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "einsum_default_327",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_699",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "permute_587",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1022",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_587",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "einsum_default_328",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_327",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "permute_588",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_588",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "dtype_cast_357",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_357",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w3",
-      "name": "alias_default_1644",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_373",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "convert_element_type_1104",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_696",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "convert_element_type_1105",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_1023",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1023",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "neg_39",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "exp_39",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "add_210",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_210",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "reciprocal_7",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_7",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_374",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_374",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_1024",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1024",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_375",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1024",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "sub_22",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1023",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_376",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_376",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "add_211",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_375",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_211",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "mul_377",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_377",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "convert_element_type_1106",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward",
-      "name": "alias_default_1025",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1025",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_694",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "einsum_default_329",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_695",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "permute_591",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1025",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_591",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "einsum_default_330",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_328",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_330",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24",
-      "name": "add_212",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_329",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "permute_592",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_592",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "dtype_cast_358",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_358",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.feed_forward.w1",
-      "name": "alias_default_1642",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_212",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_1111",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_690",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_1112",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_691",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_1113",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1111",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_1026",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1026",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1113",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_378",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1112",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_693",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_379",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_1027",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_379",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_1028",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1028",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1027",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_380",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "sum_45",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1028",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "div_47",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_381",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1027",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "sub_23",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_693",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_382",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1026",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1028",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "mul_383",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_383",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "sum_46",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_1114",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_46",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "convert_element_type_1115",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1020",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1114",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "add_213",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1115",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "dtype_cast_359",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_359",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.ffn_norm",
-      "name": "alias_default_1646",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_213",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "alias_default_1029",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1029",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_688",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "einsum_default_331",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_689",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "permute_595",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1029",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_595",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "einsum_default_332",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_331",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "permute_596",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_596",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "dtype_cast_360",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_360",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wo",
-      "name": "alias_default_1641",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_332",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_980",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_980",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_597",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_597",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_684",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_685",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_686",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_687",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_222",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_223",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_7",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_309",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_310",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_7",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.sdpa",
-      "name": "getitem_311",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_598",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_310",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_599",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_309",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "permute_600",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_598",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_981",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_981",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "sum_47",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "squeeze_14",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_599",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_982",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_982",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "sum_48",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "squeeze_15",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_1120",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_600",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_1121",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_983",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_983",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_complex_78",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_683",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "_conj_14",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_14",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "clone_126",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_78",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_126",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "mul_384",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_984",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_984",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_complex_79",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_683",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "_conj_15",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_15",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "clone_127",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_79",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_127",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "mul_385",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_384",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_real_78",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_78",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_985",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_985",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_1122",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_385",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_as_real_79",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_79",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_986",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_986",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "convert_element_type_1123",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_987",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1122",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_988",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1123",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "view_989",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_987",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_1030",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1030",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_679",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "einsum_default_333",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_682",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "permute_603",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1030",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_603",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "einsum_default_334",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_333",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "permute_604",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_604",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "dtype_cast_361",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_361",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wv",
-      "name": "alias_default_1640",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_988",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_1031",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1031",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_679",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "einsum_default_335",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_681",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "permute_607",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1031",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_607",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "einsum_default_336",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_334",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_336",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "add_214",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_335",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "permute_608",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_608",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "dtype_cast_362",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_362",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wk",
-      "name": "alias_default_1639",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_989",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention",
-      "name": "alias_default_1032",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1032",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_679",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "einsum_default_337",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_680",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "permute_611",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1032",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_611",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "einsum_default_338",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_338",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24",
-      "name": "add_215",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_337",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "permute_612",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_612",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "dtype_cast_363",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_363",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention.wq",
-      "name": "alias_default_1638",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_215",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_1136",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_675",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_1137",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_676",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_1138",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1136",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_1033",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1033",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1138",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_386",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1137",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_678",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_387",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_386",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_1034",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_387",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_1035",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1035",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1034",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_388",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_388",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "sum_49",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1035",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "div_48",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_389",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1034",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_389",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "sub_24",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_678",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_390",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1033",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1035",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "mul_391",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_391",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "sum_50",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_390",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_1139",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_50",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "convert_element_type_1140",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1029",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1139",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "add_216",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1140",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "dtype_cast_364",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_364",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.24.attention_norm",
-      "name": "alias_default_1645",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "alias_default_1036",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1036",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_673",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "einsum_default_339",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_674",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "permute_615",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1036",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_615",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "einsum_default_340",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_339",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "permute_616",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_616",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "dtype_cast_365",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_365",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "alias_default_1634",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_340",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w2",
-      "name": "alias_default_1037",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1037",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_670",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_392",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1037",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_672",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_393",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_1038",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1038",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_666",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "einsum_default_341",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_671",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "permute_619",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1038",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_619",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "einsum_default_342",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_341",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "permute_620",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_620",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "dtype_cast_366",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_366",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w3",
-      "name": "alias_default_1635",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "convert_element_type_1149",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_668",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "convert_element_type_1150",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1150",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_1039",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1039",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "neg_40",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "exp_40",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "add_217",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_217",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "reciprocal_8",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_8",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_394",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_394",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_1040",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1040",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_395",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1040",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "sub_25",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1039",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_396",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_396",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "add_218",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_395",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_218",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "mul_397",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_397",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "convert_element_type_1151",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1151",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward",
-      "name": "alias_default_1041",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1041",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_666",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "einsum_default_343",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_667",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "permute_623",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1041",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_623",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "einsum_default_344",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_342",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_344",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23",
-      "name": "add_219",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_343",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "permute_624",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_624",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "dtype_cast_367",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_367",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.feed_forward.w1",
-      "name": "alias_default_1633",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_219",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_1156",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_662",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_1157",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_663",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_1158",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_1042",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1042",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1158",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_398",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_665",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_399",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_398",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_1043",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_399",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_1044",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1044",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1043",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_400",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_400",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "sum_51",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1044",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "div_49",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_401",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1043",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_401",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "sub_26",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_665",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_402",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1042",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1044",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "mul_403",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_403",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "sum_52",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_402",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_1159",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_52",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "convert_element_type_1160",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1036",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1159",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "add_220",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1160",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "dtype_cast_368",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_368",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.ffn_norm",
-      "name": "alias_default_1637",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_220",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "alias_default_1045",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1045",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_660",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "einsum_default_345",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_661",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "permute_627",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1045",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_627",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "einsum_default_346",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_345",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "permute_628",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_628",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "dtype_cast_369",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_369",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wo",
-      "name": "alias_default_1632",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1004",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1004",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_629",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_629",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_656",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_657",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_658",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_659",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_208",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_213",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_214",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_8",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_312",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_313",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_8",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.sdpa",
-      "name": "getitem_314",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_630",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_313",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_631",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_312",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "permute_632",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_630",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1005",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1005",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "sum_53",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "squeeze_16",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_631",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1006",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1006",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "sum_54",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "squeeze_17",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_1165",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_632",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_1166",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1007",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1007",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_complex_80",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_655",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "_conj_16",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_16",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "clone_134",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_80",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_134",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "mul_404",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1166",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1008",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1008",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_complex_81",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_655",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "_conj_17",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_17",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "clone_135",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_135",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "mul_405",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_404",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_real_80",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_80",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1009",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1009",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_1167",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_405",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_as_real_81",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_81",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1010",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1010",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "convert_element_type_1168",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1011",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1167",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1012",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "view_1013",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1011",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_1046",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1046",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_651",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "einsum_default_347",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_654",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "permute_635",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1046",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_635",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "einsum_default_348",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_347",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "permute_636",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_636",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "dtype_cast_370",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_370",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wv",
-      "name": "alias_default_1631",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1012",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_1047",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1047",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_651",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "einsum_default_349",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_653",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "permute_639",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1047",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_639",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "einsum_default_350",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_350",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "add_221",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_349",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "permute_640",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_640",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "dtype_cast_371",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_371",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wk",
-      "name": "alias_default_1630",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1013",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention",
-      "name": "alias_default_1048",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1048",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_651",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "einsum_default_351",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_652",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "permute_643",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1048",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_643",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "einsum_default_352",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_221",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_352",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23",
-      "name": "add_222",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_351",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "permute_644",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_644",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "dtype_cast_372",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_372",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention.wq",
-      "name": "alias_default_1629",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_222",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_1181",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_647",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_1182",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_648",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_1183",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_1049",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1049",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1183",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_406",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1182",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_650",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_407",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_406",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_1050",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_407",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_1051",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1051",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1050",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_408",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_408",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "sum_55",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1051",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "div_50",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_409",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1050",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_409",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "sub_27",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_650",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_410",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1049",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1051",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "mul_411",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_411",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "sum_56",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_410",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_1184",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_56",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "convert_element_type_1185",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1045",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1184",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "add_223",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1185",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "dtype_cast_373",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_373",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.23.attention_norm",
-      "name": "alias_default_1636",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_223",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "alias_default_1052",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1052",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_645",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "einsum_default_353",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_646",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "permute_647",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1052",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_647",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "einsum_default_354",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_353",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "permute_648",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_648",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "dtype_cast_374",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_374",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "alias_default_1625",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_354",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w2",
-      "name": "alias_default_1053",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1053",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_642",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_412",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1053",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_644",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_413",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_412",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_1054",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1054",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_638",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "einsum_default_355",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_643",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "permute_651",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1054",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_651",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "einsum_default_356",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_355",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "permute_652",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_652",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "dtype_cast_375",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_375",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w3",
-      "name": "alias_default_1626",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_413",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "convert_element_type_1194",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_640",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "convert_element_type_1195",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1195",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_1055",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1055",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "neg_41",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "exp_41",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "add_224",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "reciprocal_9",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_9",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_414",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_414",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_1056",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1194",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1056",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_415",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1056",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "sub_28",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1055",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_416",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "add_225",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_415",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_225",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "mul_417",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_417",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "convert_element_type_1196",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1196",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward",
-      "name": "alias_default_1057",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1057",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_638",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "einsum_default_357",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_639",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "permute_655",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1057",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_655",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "einsum_default_358",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_356",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_358",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22",
-      "name": "add_226",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_357",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "permute_656",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_656",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "dtype_cast_376",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_376",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.feed_forward.w1",
-      "name": "alias_default_1624",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_226",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_1201",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_634",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_1202",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_635",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_1203",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1201",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_1058",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1058",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1203",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_418",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_637",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_419",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_418",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_1059",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_419",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_1060",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1060",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1059",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_420",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_420",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "sum_57",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1060",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "div_51",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_421",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1059",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_421",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "sub_29",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_637",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_422",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1058",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1060",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "mul_423",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_423",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "sum_58",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_422",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_1204",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_58",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "convert_element_type_1205",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1052",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1204",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "add_227",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1205",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "dtype_cast_377",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_377",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.ffn_norm",
-      "name": "alias_default_1628",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "alias_default_1061",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1061",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_632",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "einsum_default_359",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_633",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "permute_659",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1061",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_659",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "einsum_default_360",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_359",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "permute_660",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_660",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "dtype_cast_378",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_378",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wo",
-      "name": "alias_default_1623",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_360",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1028",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1028",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_661",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_661",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_628",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_629",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_630",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_631",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_199",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_204",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_205",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_9",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_315",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_316",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_9",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.sdpa",
-      "name": "getitem_317",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_317",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_662",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_316",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_663",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_315",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "permute_664",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_662",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1029",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1029",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "sum_59",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "squeeze_18",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_663",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1030",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1030",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "sum_60",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "squeeze_19",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_1210",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_664",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_1211",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1210",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1031",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1031",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_complex_82",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_627",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "_conj_18",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_18",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "clone_142",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_142",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "mul_424",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1211",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1032",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1032",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_complex_83",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_627",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "_conj_19",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_19",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "clone_143",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_143",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "mul_425",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_424",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_real_82",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1033",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1033",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_1212",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_425",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_as_real_83",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1034",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1034",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "convert_element_type_1213",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1035",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1212",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1036",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1213",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "view_1037",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1035",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_1062",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1062",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_623",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "einsum_default_361",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_626",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "permute_667",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1062",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_667",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "einsum_default_362",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_361",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "permute_668",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_668",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "dtype_cast_379",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_379",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wv",
-      "name": "alias_default_1622",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1036",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_1063",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1063",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_623",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "einsum_default_363",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_625",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "permute_671",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1063",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_671",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "einsum_default_364",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_364",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "add_228",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_363",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "permute_672",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_672",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "dtype_cast_380",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_380",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wk",
-      "name": "alias_default_1621",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1037",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention",
-      "name": "alias_default_1064",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1064",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_623",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "einsum_default_365",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_624",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "permute_675",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1064",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_675",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "einsum_default_366",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_228",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_366",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22",
-      "name": "add_229",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_365",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "permute_676",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_676",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "dtype_cast_381",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_381",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention.wq",
-      "name": "alias_default_1620",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_229",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_1226",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_619",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_1227",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_620",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_1228",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1226",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_1065",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1065",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1228",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_426",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_622",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_427",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_426",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_1066",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_427",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_1067",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1067",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1066",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_428",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_428",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "sum_61",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1067",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "div_52",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_52",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_429",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1066",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_429",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "sub_30",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_622",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_430",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1065",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1067",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "mul_431",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_431",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "sum_62",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_430",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_1229",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_62",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "convert_element_type_1230",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1061",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1229",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "add_230",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1230",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "dtype_cast_382",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_382",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.22.attention_norm",
-      "name": "alias_default_1627",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_230",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "alias_default_1068",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1068",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_617",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "einsum_default_367",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_618",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "permute_679",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1068",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_679",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "einsum_default_368",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_367",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "permute_680",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_680",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "dtype_cast_383",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_383",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "alias_default_1616",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_368",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w2",
-      "name": "alias_default_1069",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1069",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_614",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_432",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1069",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_616",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_433",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_432",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_1070",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1070",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_610",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "einsum_default_369",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_615",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "permute_683",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1070",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_683",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "einsum_default_370",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_369",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "permute_684",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_684",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "dtype_cast_384",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_384",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w3",
-      "name": "alias_default_1617",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_433",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "convert_element_type_1239",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_612",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "convert_element_type_1240",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1240",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_1071",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1071",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "neg_42",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "exp_42",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "add_231",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_231",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "reciprocal_10",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_10",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_434",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_434",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_1072",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1239",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1072",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_435",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1072",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "sub_31",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1071",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_436",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_436",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "add_232",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_435",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_232",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "mul_437",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_437",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "convert_element_type_1241",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1241",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward",
-      "name": "alias_default_1073",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1073",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_610",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "einsum_default_371",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_611",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "permute_687",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1073",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_687",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "einsum_default_372",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_370",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_372",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21",
-      "name": "add_233",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_371",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "permute_688",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_688",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "dtype_cast_385",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_385",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.feed_forward.w1",
-      "name": "alias_default_1615",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_233",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_1246",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_606",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_1247",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_607",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_1248",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1246",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_1074",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1074",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1248",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_438",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1247",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_609",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_439",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_438",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_1075",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_439",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_1076",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1076",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1075",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_440",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_440",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "sum_63",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1076",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "div_53",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_53",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_441",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1075",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_441",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "sub_32",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_32",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_609",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_442",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1074",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1076",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "mul_443",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_443",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "sum_64",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_442",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_1249",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_64",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "convert_element_type_1250",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1068",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1249",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "add_234",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1250",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "dtype_cast_386",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_386",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.ffn_norm",
-      "name": "alias_default_1619",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "alias_default_1077",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1077",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_604",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "einsum_default_373",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_605",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "permute_691",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1077",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_691",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "einsum_default_374",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_373",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "permute_692",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_692",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "dtype_cast_387",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_387",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wo",
-      "name": "alias_default_1614",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_374",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1052",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1052",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_693",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_693",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_600",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_601",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_602",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_603",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_190",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_195",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_196",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_10",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_318",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_319",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.sdpa",
-      "name": "getitem_320",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_320",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_694",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_319",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_695",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_318",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "permute_696",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_694",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1053",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1053",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "sum_65",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_65",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "squeeze_20",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_695",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1054",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1054",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "sum_66",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_66",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "squeeze_21",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_1255",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_696",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_1256",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1255",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1055",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1055",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_complex_84",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_599",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "_conj_20",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_20",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "clone_150",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_150",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "mul_444",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1256",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1056",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1056",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_complex_85",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_599",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "_conj_21",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_21",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "clone_151",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "mul_445",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_444",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_real_84",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1057",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1057",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_1257",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_445",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_as_real_85",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1058",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1058",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "convert_element_type_1258",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1059",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1257",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1060",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1258",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "view_1061",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1059",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_1078",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1078",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_595",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "einsum_default_375",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_598",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "permute_699",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1078",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_699",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "einsum_default_376",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_375",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "permute_700",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_700",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "dtype_cast_388",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_388",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wv",
-      "name": "alias_default_1613",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1060",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_1079",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1079",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_595",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "einsum_default_377",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_597",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "permute_703",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1079",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_703",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "einsum_default_378",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_376",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "add_235",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_377",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "permute_704",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_704",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "dtype_cast_389",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_389",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wk",
-      "name": "alias_default_1612",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1061",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention",
-      "name": "alias_default_1080",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1080",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_595",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "einsum_default_379",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_596",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "permute_707",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1080",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_707",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "einsum_default_380",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_235",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21",
-      "name": "add_236",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_379",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "permute_708",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_708",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "dtype_cast_390",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_390",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention.wq",
-      "name": "alias_default_1611",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_1271",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_591",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_1272",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_592",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_1273",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1271",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_1081",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1081",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1273",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_446",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1272",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_594",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_447",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_446",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_1082",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_447",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_1083",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1083",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1082",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_448",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_448",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "sum_67",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1083",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "div_54",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_67",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_449",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1082",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_449",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "sub_33",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_33",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_594",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_450",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1081",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1083",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "mul_451",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_451",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "sum_68",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_450",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_1274",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_68",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "convert_element_type_1275",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1077",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1274",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "add_237",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1275",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "dtype_cast_391",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_391",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.21.attention_norm",
-      "name": "alias_default_1618",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "alias_default_1084",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1084",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_589",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "einsum_default_381",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_590",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "permute_711",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1084",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_711",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "einsum_default_382",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_381",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "permute_712",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_712",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "dtype_cast_392",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_392",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "alias_default_1607",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_382",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w2",
-      "name": "alias_default_1085",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1085",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_586",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_452",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1085",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_588",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_453",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_452",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_1086",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1086",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_582",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "einsum_default_383",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_587",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "permute_715",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1086",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_715",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "einsum_default_384",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_383",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "permute_716",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_716",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "dtype_cast_393",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_393",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w3",
-      "name": "alias_default_1608",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_453",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "convert_element_type_1284",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_584",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "convert_element_type_1285",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1285",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_1087",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1087",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "neg_43",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "exp_43",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "add_238",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_238",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "reciprocal_11",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_11",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_454",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_454",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_1088",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1284",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1088",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_455",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1088",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "sub_34",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1087",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_456",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_456",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "add_239",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_455",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_239",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "mul_457",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_457",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "convert_element_type_1286",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1286",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward",
-      "name": "alias_default_1089",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1089",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_582",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "einsum_default_385",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_583",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "permute_719",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1089",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_719",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "einsum_default_386",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_384",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_386",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20",
-      "name": "add_240",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_385",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "permute_720",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_720",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "dtype_cast_394",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_394",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.feed_forward.w1",
-      "name": "alias_default_1606",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_240",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_1291",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_578",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_1292",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_579",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_1293",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_1090",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1090",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1293",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_458",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1292",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_581",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_459",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_458",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_1091",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_459",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_1092",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1092",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1091",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_460",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_460",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "sum_69",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1092",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "div_55",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_461",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1091",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_461",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "sub_35",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_35",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_581",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_462",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1090",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1092",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "mul_463",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_463",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "sum_70",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_462",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_1294",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_70",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "convert_element_type_1295",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1084",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1294",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "add_241",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1295",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "dtype_cast_395",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_395",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.ffn_norm",
-      "name": "alias_default_1610",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "alias_default_1093",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1093",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_576",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "einsum_default_387",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_577",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "permute_723",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1093",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_723",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "einsum_default_388",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_387",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "permute_724",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_724",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "dtype_cast_396",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_396",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wo",
-      "name": "alias_default_1605",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_388",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1076",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1076",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_725",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_725",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_572",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_573",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_574",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_575",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_186",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_187",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_11",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_321",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_322",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_11",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.sdpa",
-      "name": "getitem_323",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_726",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_727",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "permute_728",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_726",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1077",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1077",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "sum_71",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_71",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "squeeze_22",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_727",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1078",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1078",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "sum_72",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_72",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "squeeze_23",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_1300",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_728",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_1301",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1300",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1079",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1079",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_complex_86",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_571",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "_conj_22",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_22",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "clone_158",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_86",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_158",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "mul_464",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1301",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1080",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1080",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_complex_87",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_571",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "_conj_23",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_23",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "clone_159",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_87",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_159",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "mul_465",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_464",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_real_86",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_86",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1081",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1081",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_1302",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_465",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_as_real_87",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_87",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1082",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1082",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "convert_element_type_1303",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1083",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1302",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1084",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1303",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "view_1085",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1083",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_1094",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1094",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_567",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "einsum_default_389",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_570",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "permute_731",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1094",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_731",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "einsum_default_390",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_389",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "permute_732",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_732",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "dtype_cast_397",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_397",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wv",
-      "name": "alias_default_1604",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1084",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_1095",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1095",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_567",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "einsum_default_391",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_569",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "permute_735",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1095",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_735",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "einsum_default_392",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_390",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_392",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "add_242",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_391",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "permute_736",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_736",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "dtype_cast_398",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_398",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wk",
-      "name": "alias_default_1603",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1085",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention",
-      "name": "alias_default_1096",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1096",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_567",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "einsum_default_393",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_568",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "permute_739",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1096",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_739",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "einsum_default_394",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_394",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20",
-      "name": "add_243",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_393",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "permute_740",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_740",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "dtype_cast_399",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_399",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention.wq",
-      "name": "alias_default_1602",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_1316",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_563",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_1317",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_564",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_1318",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1316",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_1097",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1097",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1318",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_466",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1317",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_566",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_467",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_466",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_1098",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_467",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_1099",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1099",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1098",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_468",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_468",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "sum_73",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1099",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "div_56",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_73",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_469",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1098",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_469",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "sub_36",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_36",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_566",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_470",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1097",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1099",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "mul_471",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_471",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "sum_74",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_470",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_1319",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_74",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "convert_element_type_1320",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1093",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1319",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "add_244",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1320",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "dtype_cast_400",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_400",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.20.attention_norm",
-      "name": "alias_default_1609",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_244",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "alias_default_1100",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1100",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_561",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "einsum_default_395",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_562",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "permute_743",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1100",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_743",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "einsum_default_396",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_395",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "permute_744",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_744",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "dtype_cast_401",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_401",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "alias_default_1598",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_396",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w2",
-      "name": "alias_default_1101",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_558",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_472",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_560",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_473",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_472",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_1102",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_554",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "einsum_default_397",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_559",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "permute_747",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_747",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "einsum_default_398",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_397",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "permute_748",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_748",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "dtype_cast_402",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_402",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w3",
-      "name": "alias_default_1599",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_473",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "convert_element_type_1329",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_556",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "convert_element_type_1330",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1330",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_1103",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "neg_44",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "exp_44",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "add_245",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_245",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "reciprocal_12",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_12",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_474",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_474",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_1104",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1329",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_475",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "sub_37",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_476",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_476",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "add_246",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_475",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_246",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "mul_477",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_477",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "convert_element_type_1331",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1331",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward",
-      "name": "alias_default_1105",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_554",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "einsum_default_399",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_555",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "permute_751",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_751",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "einsum_default_400",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_398",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_400",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19",
-      "name": "add_247",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_399",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "permute_752",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_752",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "dtype_cast_403",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_403",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.feed_forward.w1",
-      "name": "alias_default_1597",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_247",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_1336",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_550",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_1337",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_551",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_1338",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1336",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_1106",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1106",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1338",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_478",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1337",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_553",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_479",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_478",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_1107",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_479",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_1108",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1108",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1107",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_480",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_480",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "sum_75",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1108",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "div_57",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_75",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_481",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1107",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_481",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "sub_38",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_38",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_553",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_482",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1106",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1108",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "mul_483",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_483",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "sum_76",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_482",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_1339",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_76",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "convert_element_type_1340",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1100",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "add_248",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1340",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "dtype_cast_404",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_404",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.ffn_norm",
-      "name": "alias_default_1601",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_248",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "alias_default_1109",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_548",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "einsum_default_401",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_549",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "permute_755",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_755",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "einsum_default_402",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_401",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "permute_756",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_756",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "dtype_cast_405",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_405",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wo",
-      "name": "alias_default_1596",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_402",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1100",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1100",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_757",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_757",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_544",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_545",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_546",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_547",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_177",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_178",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_12",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_324",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_325",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.sdpa",
-      "name": "getitem_326",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_758",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_325",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_759",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_324",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "permute_760",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_758",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1101",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "sum_77",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_77",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "squeeze_24",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_759",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1102",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "sum_78",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_78",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "squeeze_25",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_1345",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_760",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_1346",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1345",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1103",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_complex_88",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_543",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "_conj_24",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_24",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "clone_166",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_88",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_166",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "mul_484",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1346",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1104",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_complex_89",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_543",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "_conj_25",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_25",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "clone_167",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_89",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_167",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "mul_485",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_484",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_real_88",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_88",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1105",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_1347",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_485",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_as_real_89",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_89",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1106",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "convert_element_type_1348",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1107",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1347",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1108",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1348",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "view_1109",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_1110",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_539",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "einsum_default_403",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_542",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "permute_763",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_763",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "einsum_default_404",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_403",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "permute_764",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_764",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "dtype_cast_406",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_406",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wv",
-      "name": "alias_default_1595",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_1111",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_539",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "einsum_default_405",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_541",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "permute_767",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_767",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "einsum_default_406",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_404",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_406",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "add_249",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_405",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "permute_768",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_768",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "dtype_cast_407",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_407",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wk",
-      "name": "alias_default_1594",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention",
-      "name": "alias_default_1112",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_539",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "einsum_default_407",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_540",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "permute_771",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_771",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "einsum_default_408",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_249",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_408",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19",
-      "name": "add_250",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_407",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "permute_772",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_772",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "dtype_cast_408",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_408",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention.wq",
-      "name": "alias_default_1593",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_250",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_1361",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_535",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_1362",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_536",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_1363",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1361",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_1113",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1113",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1363",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_486",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_538",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_487",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_486",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_1114",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_487",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_1115",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1114",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_488",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_488",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "sum_79",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "div_58",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_58",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_79",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_489",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1114",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_489",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "sub_39",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_39",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_538",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_490",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1113",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "mul_491",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_491",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "sum_80",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_490",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_1364",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_80",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "convert_element_type_1365",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1364",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "add_251",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1365",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "dtype_cast_409",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_409",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.19.attention_norm",
-      "name": "alias_default_1600",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_251",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "alias_default_1116",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1116",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_533",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "einsum_default_409",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_534",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "permute_775",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1116",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_775",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "einsum_default_410",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_409",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "permute_776",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_776",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "dtype_cast_410",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_410",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "alias_default_1589",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_410",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w2",
-      "name": "alias_default_1117",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1117",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_530",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_492",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1117",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_532",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_493",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_492",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_1118",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1118",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_526",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "einsum_default_411",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_531",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "permute_779",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1118",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_779",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "einsum_default_412",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_411",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "permute_780",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_780",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "dtype_cast_411",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_411",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w3",
-      "name": "alias_default_1590",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_493",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "convert_element_type_1374",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_528",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "convert_element_type_1375",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1375",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_1119",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1119",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "neg_45",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "exp_45",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "add_252",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_252",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "reciprocal_13",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_13",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_494",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_494",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_1120",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1374",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_495",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "sub_40",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1119",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_496",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_496",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "add_253",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_495",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_253",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "mul_497",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_497",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "convert_element_type_1376",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1376",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward",
-      "name": "alias_default_1121",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_526",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "einsum_default_413",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_527",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "permute_783",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_783",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "einsum_default_414",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_412",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_414",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18",
-      "name": "add_254",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_413",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "permute_784",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_784",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "dtype_cast_412",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_412",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.feed_forward.w1",
-      "name": "alias_default_1588",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_254",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_1381",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_522",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_1382",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_523",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_1383",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_1122",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1122",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1383",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_498",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_525",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_499",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_498",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_1123",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_499",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_1124",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1123",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_500",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_500",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "sum_81",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "div_59",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_501",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1123",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_501",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "sub_41",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_525",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_502",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1122",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "mul_503",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_503",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "sum_82",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_502",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_1384",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_82",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "convert_element_type_1385",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1116",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1384",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "add_255",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1385",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "dtype_cast_413",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_413",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.ffn_norm",
-      "name": "alias_default_1592",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_255",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "alias_default_1125",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_520",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "einsum_default_415",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_521",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "permute_787",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_787",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "einsum_default_416",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_415",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "permute_788",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_788",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "dtype_cast_414",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_414",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wo",
-      "name": "alias_default_1587",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_416",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1124",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1124",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_789",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_789",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_516",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_517",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_518",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_519",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_163",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_168",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_169",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_13",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_327",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_328",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.sdpa",
-      "name": "getitem_329",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_329",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_790",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_328",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_791",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_327",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "permute_792",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_790",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1125",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1125",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "sum_83",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_83",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "squeeze_26",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_791",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1126",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "sum_84",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "squeeze_27",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_1390",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_792",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_1391",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1127",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_complex_90",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_515",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "_conj_26",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_26",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "clone_174",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_90",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_174",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "mul_504",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1391",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1128",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_complex_91",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_515",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "_conj_27",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_27",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "clone_175",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_91",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_175",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "mul_505",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_504",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_real_90",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_90",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1129",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1129",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_1392",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_505",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_as_real_91",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_91",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1130",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1130",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "convert_element_type_1393",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1131",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1132",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "view_1133",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1131",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_1126",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_511",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "einsum_default_417",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_514",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "permute_795",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_795",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "einsum_default_418",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_417",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "permute_796",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_796",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "dtype_cast_415",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_415",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wv",
-      "name": "alias_default_1586",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1132",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_1127",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_511",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "einsum_default_419",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_513",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "permute_799",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_799",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "einsum_default_420",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_418",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_420",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "add_256",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_419",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "permute_800",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_800",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "dtype_cast_416",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_416",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wk",
-      "name": "alias_default_1585",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1133",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention",
-      "name": "alias_default_1128",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_511",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "einsum_default_421",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_512",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "permute_803",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1128",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_803",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "einsum_default_422",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_256",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_422",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18",
-      "name": "add_257",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_421",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "permute_804",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_804",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "dtype_cast_417",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_417",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention.wq",
-      "name": "alias_default_1584",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_257",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_1406",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_507",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_1407",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_508",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_1408",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1406",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_1129",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1129",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1408",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_506",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1407",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_510",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_507",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_506",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_1130",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_507",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_1131",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1131",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_508",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_508",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "sum_85",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1131",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "div_60",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_85",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_509",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_509",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "sub_42",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_510",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_510",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1129",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1131",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "mul_511",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_511",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "sum_86",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_510",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_1409",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_86",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "convert_element_type_1410",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1409",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "add_258",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1410",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "dtype_cast_418",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_418",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.18.attention_norm",
-      "name": "alias_default_1591",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "alias_default_1132",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1132",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_505",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "einsum_default_423",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_506",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "permute_807",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1132",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_807",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "einsum_default_424",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_423",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "permute_808",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_808",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "dtype_cast_419",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_419",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "alias_default_1580",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_424",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w2",
-      "name": "alias_default_1133",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1133",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_502",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_512",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1133",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_504",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_513",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_512",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_1134",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1134",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_498",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "einsum_default_425",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_503",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "permute_811",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1134",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_811",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "einsum_default_426",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_425",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "permute_812",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_812",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "dtype_cast_420",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_420",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w3",
-      "name": "alias_default_1581",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_513",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "convert_element_type_1419",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_500",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "convert_element_type_1420",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1420",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_1135",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1135",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "neg_46",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "exp_46",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "add_259",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_259",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "reciprocal_14",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_14",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_514",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_514",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_1136",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1419",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1136",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_515",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1136",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "sub_43",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1135",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_516",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_516",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "add_260",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_515",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_260",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "mul_517",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_517",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "convert_element_type_1421",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1421",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward",
-      "name": "alias_default_1137",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_498",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "einsum_default_427",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_499",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "permute_815",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_815",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "einsum_default_428",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_426",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_428",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17",
-      "name": "add_261",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_427",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "permute_816",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_816",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "dtype_cast_421",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_421",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.feed_forward.w1",
-      "name": "alias_default_1579",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_261",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_1426",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_494",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_1427",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_495",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_1428",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1426",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_1138",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1138",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1428",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_518",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1427",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_497",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_519",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_518",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_1139",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_519",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_1140",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1140",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1139",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_520",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_520",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "sum_87",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1140",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "div_61",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_61",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_521",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1139",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_521",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "sub_44",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_497",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_522",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1138",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1140",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "mul_523",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_523",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "sum_88",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_522",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_1429",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_88",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "convert_element_type_1430",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1132",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1429",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "add_262",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1430",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "dtype_cast_422",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_422",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.ffn_norm",
-      "name": "alias_default_1583",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_262",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "alias_default_1141",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_492",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "einsum_default_429",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_493",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "permute_819",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_819",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "einsum_default_430",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_429",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "permute_820",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_820",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "dtype_cast_423",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_423",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wo",
-      "name": "alias_default_1578",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_430",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1148",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1148",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_821",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_821",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_488",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_489",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_490",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_491",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_159",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_160",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_14",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_330",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_331",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.sdpa",
-      "name": "getitem_332",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_332",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_822",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_331",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_823",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_330",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "permute_824",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_822",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1149",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "sum_89",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_89",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "squeeze_28",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_823",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1150",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1150",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "sum_90",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_90",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "squeeze_29",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_1435",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_824",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_1436",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1435",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1151",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1151",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_complex_92",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_487",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "_conj_28",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_28",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "clone_182",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_92",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_182",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "mul_524",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1436",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1152",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1152",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_complex_93",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_487",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "_conj_29",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_29",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "clone_183",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_93",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_183",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "mul_525",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_524",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_real_92",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_92",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1153",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1153",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_1437",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_525",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_as_real_93",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_93",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1154",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1154",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "convert_element_type_1438",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1155",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1437",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1156",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1438",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "view_1157",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1155",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_1142",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1142",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_483",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "einsum_default_431",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_486",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "permute_827",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1142",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_827",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "einsum_default_432",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_431",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "permute_828",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_828",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "dtype_cast_424",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_424",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wv",
-      "name": "alias_default_1577",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1156",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_1143",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1143",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_483",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "einsum_default_433",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_485",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "permute_831",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1143",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_831",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "einsum_default_434",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_432",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_434",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "add_263",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_433",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "permute_832",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_832",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "dtype_cast_425",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_425",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wk",
-      "name": "alias_default_1576",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1157",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention",
-      "name": "alias_default_1144",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1144",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_483",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "einsum_default_435",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_484",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "permute_835",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1144",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_835",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "einsum_default_436",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_263",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_436",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17",
-      "name": "add_264",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_435",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "permute_836",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_836",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "dtype_cast_426",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_426",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention.wq",
-      "name": "alias_default_1575",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_264",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_1451",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_479",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_1452",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_480",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_1453",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1451",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_1145",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1453",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_526",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1452",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_482",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_527",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_526",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_1146",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_527",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_1147",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_528",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_528",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "sum_91",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "div_62",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_529",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_529",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "sub_45",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_45",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_482",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_530",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "mul_531",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_531",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "sum_92",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_530",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_1454",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_92",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "convert_element_type_1455",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1454",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "add_265",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1455",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "dtype_cast_427",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_427",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.17.attention_norm",
-      "name": "alias_default_1582",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_265",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "alias_default_1148",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1148",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_477",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "einsum_default_437",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_478",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "permute_839",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1148",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_839",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "einsum_default_438",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_437",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "permute_840",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_840",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "dtype_cast_428",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_428",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "alias_default_1571",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_438",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w2",
-      "name": "alias_default_1149",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_474",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_532",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_476",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_533",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_532",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_1150",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1150",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_470",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "einsum_default_439",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_475",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "permute_843",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1150",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_843",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "einsum_default_440",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_439",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "permute_844",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_844",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "dtype_cast_429",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_429",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w3",
-      "name": "alias_default_1572",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_533",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "convert_element_type_1464",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_472",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "convert_element_type_1465",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1465",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_1151",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1151",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "neg_47",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "exp_47",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "add_266",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_266",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "reciprocal_15",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_15",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_534",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_534",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_1152",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1464",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1152",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_535",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1152",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "sub_46",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1151",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_536",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_536",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "add_267",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_535",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_267",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "mul_537",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_537",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "convert_element_type_1466",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1466",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward",
-      "name": "alias_default_1153",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1153",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_470",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "einsum_default_441",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_471",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "permute_847",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1153",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_847",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "einsum_default_442",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_440",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_442",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16",
-      "name": "add_268",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_441",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "permute_848",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_848",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "dtype_cast_430",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_430",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.feed_forward.w1",
-      "name": "alias_default_1570",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_268",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_1471",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_466",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_1472",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_467",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_1473",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1471",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_1154",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1473",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_538",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1472",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_469",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_539",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_538",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_1155",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_539",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_1156",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1155",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_540",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_540",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "sum_93",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "div_63",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_93",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_541",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1155",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_541",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "sub_47",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_47",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_469",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_542",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "mul_543",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_543",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "sum_94",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_542",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_1474",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_94",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "convert_element_type_1475",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1148",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1474",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "add_269",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1475",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "dtype_cast_431",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_431",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.ffn_norm",
-      "name": "alias_default_1574",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "alias_default_1157",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_464",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "einsum_default_443",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_465",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "permute_851",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_851",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "einsum_default_444",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_443",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "permute_852",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_852",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "dtype_cast_432",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_432",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wo",
-      "name": "alias_default_1569",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_444",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1172",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1172",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_853",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_853",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_460",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_461",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_462",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_463",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_150",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_15",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_333",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_334",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.sdpa",
-      "name": "getitem_335",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_335",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_854",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_334",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_855",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "permute_856",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_854",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1173",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1173",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "sum_95",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_95",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "squeeze_30",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_855",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1174",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1174",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "sum_96",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_96",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "squeeze_31",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_1480",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_856",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_1481",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1480",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1175",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1175",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_complex_94",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_459",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "_conj_30",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_30",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "clone_190",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_94",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_190",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "mul_544",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1481",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1176",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_complex_95",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_459",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "_conj_31",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_31",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "clone_191",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_95",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_191",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "mul_545",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_544",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_real_94",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_94",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1177",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1177",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_1482",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_545",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_as_real_95",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_95",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1178",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1178",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "convert_element_type_1483",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1179",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1482",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1180",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1483",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "view_1181",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1179",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_1158",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1158",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_455",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "einsum_default_445",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_458",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "permute_859",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1158",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_859",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "einsum_default_446",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_445",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "permute_860",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_860",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "dtype_cast_433",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_433",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wv",
-      "name": "alias_default_1568",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1180",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_1159",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1159",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_455",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "einsum_default_447",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_457",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "permute_863",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1159",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_863",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "einsum_default_448",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_446",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_448",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "add_270",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_447",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "permute_864",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_864",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "dtype_cast_434",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_434",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wk",
-      "name": "alias_default_1567",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1181",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention",
-      "name": "alias_default_1160",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1160",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_455",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "einsum_default_449",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_456",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "permute_867",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1160",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_867",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "einsum_default_450",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_450",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16",
-      "name": "add_271",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_449",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "permute_868",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_868",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "dtype_cast_435",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_435",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention.wq",
-      "name": "alias_default_1566",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_271",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_1496",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_451",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_1497",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_452",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_1498",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1496",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_1161",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1161",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1498",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_546",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1497",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_454",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_547",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_546",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_1162",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_547",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_1163",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1163",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1162",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_548",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_548",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "sum_97",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1163",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "div_64",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_97",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_549",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1162",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_549",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "sub_48",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_48",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_454",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_550",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1161",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1163",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "mul_551",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_551",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "sum_98",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_550",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_1499",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_98",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "convert_element_type_1500",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1499",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "add_272",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1500",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "dtype_cast_436",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_436",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.16.attention_norm",
-      "name": "alias_default_1573",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_272",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "alias_default_1164",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1164",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_449",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "einsum_default_451",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_450",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "permute_871",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1164",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_871",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "einsum_default_452",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_451",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "permute_872",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_872",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "dtype_cast_437",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_437",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "alias_default_1562",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_452",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w2",
-      "name": "alias_default_1165",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_446",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_552",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1165",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_448",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_553",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_552",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_1166",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1166",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_442",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "einsum_default_453",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_447",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "permute_875",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1166",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_875",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "einsum_default_454",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_453",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "permute_876",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_876",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "dtype_cast_438",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_438",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w3",
-      "name": "alias_default_1563",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_553",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "convert_element_type_1509",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_444",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "convert_element_type_1510",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1510",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_1167",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1167",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "neg_48",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "exp_48",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "add_273",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_273",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "reciprocal_16",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_16",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_554",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_554",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_1168",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1509",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_555",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "sub_49",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1167",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_556",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_556",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "add_274",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_555",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_274",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "mul_557",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_557",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "convert_element_type_1511",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1511",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward",
-      "name": "alias_default_1169",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1169",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_442",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "einsum_default_455",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_443",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "permute_879",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1169",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_879",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "einsum_default_456",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_454",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_456",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15",
-      "name": "add_275",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_455",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "permute_880",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_880",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "dtype_cast_439",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_439",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.feed_forward.w1",
-      "name": "alias_default_1561",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_275",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_1516",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_438",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_1517",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_439",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_1518",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1516",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_1170",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1170",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1518",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_558",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1517",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_441",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_559",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_558",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_1171",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_559",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_1172",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_560",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_560",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "sum_99",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "div_65",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_65",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_561",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_561",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "sub_50",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_50",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_441",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_562",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1170",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "mul_563",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_563",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "sum_100",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_562",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_1519",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_100",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "convert_element_type_1520",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1164",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1519",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "add_276",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1520",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "dtype_cast_440",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_440",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.ffn_norm",
-      "name": "alias_default_1565",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_276",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "alias_default_1173",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_436",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "einsum_default_457",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_437",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "permute_883",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_883",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "einsum_default_458",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_457",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "permute_884",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_884",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "dtype_cast_441",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_441",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wo",
-      "name": "alias_default_1560",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_458",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1196",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1196",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_885",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_885",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_432",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_433",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_434",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_435",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_136",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_141",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_142",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_16",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_336",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_337",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.sdpa",
-      "name": "getitem_338",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_338",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_886",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_337",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_887",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_336",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "permute_888",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_886",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1197",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "sum_101",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "squeeze_32",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_887",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1198",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1198",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "sum_102",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "squeeze_33",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_33",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_1525",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_888",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_1526",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1525",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1199",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1199",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_complex_96",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_431",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "_conj_32",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_32",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "clone_198",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_96",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_198",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "mul_564",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1526",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1200",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1200",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_complex_97",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_431",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "_conj_33",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_33",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "clone_199",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_97",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_199",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "mul_565",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_564",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_real_96",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_96",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1201",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1201",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_1527",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_565",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_as_real_97",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_97",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1202",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1202",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "convert_element_type_1528",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_32",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1203",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1527",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1204",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1528",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "view_1205",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1203",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_1174",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1174",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_427",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "einsum_default_459",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_430",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "permute_891",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1174",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_891",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "einsum_default_460",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_459",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "permute_892",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_892",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "dtype_cast_442",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_442",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wv",
-      "name": "alias_default_1559",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1204",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_1175",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1175",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_427",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "einsum_default_461",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_429",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "permute_895",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1175",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_895",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "einsum_default_462",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_460",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_462",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "add_277",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_461",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "permute_896",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_896",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "dtype_cast_443",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_443",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wk",
-      "name": "alias_default_1558",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1205",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention",
-      "name": "alias_default_1176",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_427",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "einsum_default_463",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_428",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "permute_899",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_899",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "einsum_default_464",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_277",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_464",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15",
-      "name": "add_278",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_463",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "permute_900",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_900",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "dtype_cast_444",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_444",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention.wq",
-      "name": "alias_default_1557",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_278",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_1541",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_423",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_1542",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_424",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_1543",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1541",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_1177",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1177",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1543",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_566",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1542",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_426",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_567",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_566",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_1178",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_567",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_1179",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1179",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1178",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_568",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_568",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "sum_103",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1179",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "div_66",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_66",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_103",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_569",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1178",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_569",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "sub_51",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_51",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_426",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_570",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1177",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1179",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "mul_571",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_571",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "sum_104",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_570",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_1544",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_104",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "convert_element_type_1545",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1544",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "add_279",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1545",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "dtype_cast_445",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_445",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.15.attention_norm",
-      "name": "alias_default_1564",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_279",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "alias_default_1180",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1180",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_421",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "einsum_default_465",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_422",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "permute_903",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1180",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_903",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "einsum_default_466",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_465",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "permute_904",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_904",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "dtype_cast_446",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_446",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "alias_default_1553",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_466",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w2",
-      "name": "alias_default_1181",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1181",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_418",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_572",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1181",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_420",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_573",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_572",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_1182",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1182",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_414",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "einsum_default_467",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_419",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "permute_907",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1182",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_907",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "einsum_default_468",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_467",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "permute_908",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_908",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "dtype_cast_447",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_447",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w3",
-      "name": "alias_default_1554",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_573",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "convert_element_type_1554",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "convert_element_type_1555",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1555",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_1183",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1183",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "neg_49",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "exp_49",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "add_280",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_280",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "reciprocal_17",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_17",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_574",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_574",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_1184",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1554",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1184",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_575",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1184",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "sub_52",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1183",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_576",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_576",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "add_281",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_575",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "mul_577",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_577",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "convert_element_type_1556",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1556",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward",
-      "name": "alias_default_1185",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1185",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_414",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "einsum_default_469",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_415",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "permute_911",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1185",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_911",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "einsum_default_470",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_468",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_470",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14",
-      "name": "add_282",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_469",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "permute_912",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_912",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "dtype_cast_448",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_448",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.feed_forward.w1",
-      "name": "alias_default_1552",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_282",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_1561",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_410",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_1562",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_411",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_1563",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1561",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_1186",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1563",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_578",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1562",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_413",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_579",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_578",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_1187",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_579",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_1188",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1188",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1187",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_580",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_580",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "sum_105",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1188",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "div_67",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_67",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_105",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_581",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1187",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_581",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "sub_53",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_53",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_413",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_582",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1188",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "mul_583",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_583",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "sum_106",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_582",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_1564",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_106",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "convert_element_type_1565",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1180",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1564",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "add_283",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1565",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "dtype_cast_449",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_449",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.ffn_norm",
-      "name": "alias_default_1556",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "alias_default_1189",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_408",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "einsum_default_471",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_409",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "permute_915",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_915",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "einsum_default_472",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_471",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "permute_916",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_916",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "dtype_cast_450",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_450",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wo",
-      "name": "alias_default_1551",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_472",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1220",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1220",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_917",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_917",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_404",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_405",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_406",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_407",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_132",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_133",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_17",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_339",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_340",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_17",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.sdpa",
-      "name": "getitem_341",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_341",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_918",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_340",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_919",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "permute_920",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_918",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1221",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1221",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "sum_107",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "squeeze_34",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_919",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1222",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1222",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "sum_108",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "squeeze_35",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_35",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_1570",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_920",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_1571",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1570",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1223",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1223",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_complex_98",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_403",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "_conj_34",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_34",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "clone_206",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_98",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_206",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "mul_584",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1571",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1224",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_complex_99",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_403",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "_conj_35",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_35",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "clone_207",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_99",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_207",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "mul_585",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_584",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_real_98",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_98",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1225",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1225",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_1572",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_585",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_as_real_99",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_99",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1226",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1226",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "convert_element_type_1573",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_34",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1227",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1572",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1228",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1573",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "view_1229",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1227",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_1190",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1190",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_399",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "einsum_default_473",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_402",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "permute_923",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1190",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_923",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "einsum_default_474",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_473",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "permute_924",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_924",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "dtype_cast_451",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_451",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wv",
-      "name": "alias_default_1550",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1228",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_1191",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1191",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_399",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "einsum_default_475",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_401",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "permute_927",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1191",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_927",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "einsum_default_476",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_474",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_476",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "add_284",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_475",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "permute_928",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_928",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "dtype_cast_452",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_452",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wk",
-      "name": "alias_default_1549",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1229",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention",
-      "name": "alias_default_1192",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1192",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_399",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "einsum_default_477",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_400",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "permute_931",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1192",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_931",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "einsum_default_478",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_284",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_478",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14",
-      "name": "add_285",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_477",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "permute_932",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_932",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "dtype_cast_453",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_453",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention.wq",
-      "name": "alias_default_1548",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_285",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_1586",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_395",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_1587",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_396",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_1588",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1586",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_1193",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1193",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1588",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_586",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1587",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_398",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_587",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_586",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_1194",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_587",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_1195",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1195",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1194",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_588",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_588",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "sum_109",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1195",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "div_68",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_68",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_589",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1194",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_589",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "sub_54",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_54",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_398",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_590",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1193",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1195",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "mul_591",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_591",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "sum_110",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_590",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_1589",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_110",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "convert_element_type_1590",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1589",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "add_286",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1590",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "dtype_cast_454",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_454",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.14.attention_norm",
-      "name": "alias_default_1555",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_286",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "alias_default_1196",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1196",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "einsum_default_479",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_394",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "permute_935",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1196",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_935",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "einsum_default_480",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_479",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "permute_936",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_936",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "dtype_cast_455",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_455",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "alias_default_1544",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_480",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w2",
-      "name": "alias_default_1197",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_592",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_593",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_592",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_1198",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1198",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_386",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "einsum_default_481",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_391",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "permute_939",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1198",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_939",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "einsum_default_482",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_481",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "permute_940",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_940",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "dtype_cast_456",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_456",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w3",
-      "name": "alias_default_1545",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_593",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "convert_element_type_1599",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_388",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "convert_element_type_1600",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1600",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_1199",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1199",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "neg_50",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "exp_50",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "add_287",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_287",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "reciprocal_18",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_18",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_594",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_594",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_1200",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1599",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1200",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_595",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1200",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "sub_55",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1199",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_596",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_596",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "add_288",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_595",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_288",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "mul_597",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_597",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "convert_element_type_1601",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1601",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward",
-      "name": "alias_default_1201",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1201",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_386",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "einsum_default_483",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_387",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "permute_943",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1201",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_943",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "einsum_default_484",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_482",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_484",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13",
-      "name": "add_289",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_483",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "permute_944",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_944",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "dtype_cast_457",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_457",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.feed_forward.w1",
-      "name": "alias_default_1543",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_289",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_1606",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_1607",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_383",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_1608",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1606",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_1202",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1608",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_598",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1607",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_385",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_599",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_598",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_1203",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_599",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_1204",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1204",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1203",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_600",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_600",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "sum_111",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1204",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "div_69",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_111",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_601",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1203",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_601",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "sub_56",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_56",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_385",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_602",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1204",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "mul_603",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_603",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "sum_112",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_602",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_1609",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_112",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "convert_element_type_1610",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1196",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1609",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "add_290",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1610",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "dtype_cast_458",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_458",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.ffn_norm",
-      "name": "alias_default_1547",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "alias_default_1205",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1205",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "einsum_default_485",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_381",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "permute_947",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1205",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_947",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "einsum_default_486",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_485",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "permute_948",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_948",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "dtype_cast_459",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_459",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wo",
-      "name": "alias_default_1542",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_486",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1244",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1244",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_949",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_949",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_376",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_377",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_379",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_118",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_123",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_124",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_18",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_342",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_343",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.sdpa",
-      "name": "getitem_344",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_344",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_950",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_343",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_951",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_342",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "permute_952",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_950",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1245",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1245",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "sum_113",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "squeeze_36",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_951",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1246",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1246",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "sum_114",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "squeeze_37",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_37",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_1615",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_952",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_1616",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1615",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1247",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1247",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_complex_100",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_375",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "_conj_36",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_36",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "clone_214",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_100",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_214",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "mul_604",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1616",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1248",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_complex_101",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_375",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "_conj_37",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_37",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "clone_215",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_215",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "mul_605",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_604",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_real_100",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_100",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1249",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1249",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_1617",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_605",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_as_real_101",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_101",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1250",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1250",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "convert_element_type_1618",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_36",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1251",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1617",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1252",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1618",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "view_1253",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1251",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_1206",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1206",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_371",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "einsum_default_487",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_374",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "permute_955",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1206",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_955",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "einsum_default_488",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_487",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "permute_956",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_956",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "dtype_cast_460",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_460",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wv",
-      "name": "alias_default_1541",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1252",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_1207",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1207",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_371",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "einsum_default_489",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_373",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "permute_959",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1207",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_959",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "einsum_default_490",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_488",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_490",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "add_291",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_489",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "permute_960",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_960",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "dtype_cast_461",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_461",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wk",
-      "name": "alias_default_1540",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1253",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention",
-      "name": "alias_default_1208",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1208",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_371",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "einsum_default_491",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_372",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "permute_963",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1208",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_963",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "einsum_default_492",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_492",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13",
-      "name": "add_292",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_491",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "permute_964",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_964",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "dtype_cast_462",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_462",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention.wq",
-      "name": "alias_default_1539",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_292",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_1631",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_367",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_1632",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_368",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_1633",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1631",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_1209",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1209",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1633",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_606",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1632",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_607",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_606",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_1210",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_607",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_1211",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1211",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1210",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_608",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_608",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "sum_115",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1211",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "div_70",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_609",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1210",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_609",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "sub_57",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_57",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_610",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1209",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1211",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "mul_611",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_611",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "sum_116",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_610",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_1634",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_116",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "convert_element_type_1635",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1205",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1634",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "add_293",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1635",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "dtype_cast_463",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_463",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.13.attention_norm",
-      "name": "alias_default_1546",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_293",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "alias_default_1212",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1212",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_365",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "einsum_default_493",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_366",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "permute_967",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1212",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_967",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "einsum_default_494",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_493",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "permute_968",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_968",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "dtype_cast_464",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_464",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "alias_default_1535",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_494",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w2",
-      "name": "alias_default_1213",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1213",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_362",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_612",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1213",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_364",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_613",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_612",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_1214",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1214",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_358",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "einsum_default_495",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_363",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "permute_971",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1214",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_971",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "einsum_default_496",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_495",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "permute_972",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_972",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "dtype_cast_465",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_465",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w3",
-      "name": "alias_default_1536",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_613",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "convert_element_type_1644",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_360",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "convert_element_type_1645",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1645",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_1215",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1215",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "neg_51",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "exp_51",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "add_294",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_294",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "reciprocal_19",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_19",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_614",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_614",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_1216",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1644",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1216",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_615",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1216",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "sub_58",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1215",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_616",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_616",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "add_295",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_615",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_295",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "mul_617",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_617",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "convert_element_type_1646",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1646",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward",
-      "name": "alias_default_1217",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1217",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_358",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "einsum_default_497",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_359",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "permute_975",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1217",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_975",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "einsum_default_498",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_496",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_498",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12",
-      "name": "add_296",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_497",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "permute_976",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_976",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "dtype_cast_466",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_466",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.feed_forward.w1",
-      "name": "alias_default_1534",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_296",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_1651",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_1652",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_355",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_1653",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1651",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_1218",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1218",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1653",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_618",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1652",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_357",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_619",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_618",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_1219",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_619",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_1220",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1220",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1219",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_620",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_620",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "sum_117",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1220",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "div_71",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_117",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_621",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1219",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_621",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "sub_59",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_357",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_622",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1218",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1220",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "mul_623",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_623",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "sum_118",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_622",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_1654",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_118",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "convert_element_type_1655",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1212",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1654",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "add_297",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1655",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "dtype_cast_467",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_467",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.ffn_norm",
-      "name": "alias_default_1538",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_297",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "alias_default_1221",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1221",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_352",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "einsum_default_499",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_353",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "permute_979",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1221",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_979",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "einsum_default_500",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_499",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "permute_980",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_980",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "dtype_cast_468",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_468",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wo",
-      "name": "alias_default_1533",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_500",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1268",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1268",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_981",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_981",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_350",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_351",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_114",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_115",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_19",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_345",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_346",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.sdpa",
-      "name": "getitem_347",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_347",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_982",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_983",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_345",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "permute_984",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_982",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1269",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1269",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "sum_119",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_119",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "squeeze_38",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_983",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1270",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1270",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "sum_120",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "squeeze_39",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_39",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_1660",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_984",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_1661",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1660",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1271",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1271",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_complex_102",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_347",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "_conj_38",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_38",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "clone_222",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_222",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "mul_624",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1661",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1272",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1272",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_complex_103",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_347",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "_conj_39",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_39",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "clone_223",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_223",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "mul_625",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_624",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_real_102",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_102",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1273",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1273",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_1662",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_625",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_as_real_103",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_103",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1274",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1274",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "convert_element_type_1663",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_38",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1275",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1662",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1276",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1663",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "view_1277",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1275",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_1222",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1222",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_343",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "einsum_default_501",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_346",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "permute_987",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1222",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_987",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "einsum_default_502",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_501",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "permute_988",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_988",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "dtype_cast_469",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_469",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wv",
-      "name": "alias_default_1532",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1276",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_1223",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1223",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_343",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "einsum_default_503",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_345",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "permute_991",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1223",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_991",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "einsum_default_504",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_502",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_504",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "add_298",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_503",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "permute_992",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_992",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "dtype_cast_470",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_470",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wk",
-      "name": "alias_default_1531",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1277",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention",
-      "name": "alias_default_1224",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_343",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "einsum_default_505",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_344",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "permute_995",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_995",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "einsum_default_506",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_506",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12",
-      "name": "add_299",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_505",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "permute_996",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_996",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "dtype_cast_471",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_471",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention.wq",
-      "name": "alias_default_1530",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_299",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_1676",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_1677",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_340",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_1678",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1676",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_1225",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1225",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1678",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_626",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1677",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_342",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_627",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_626",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_1226",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_627",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_1227",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1226",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_628",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_628",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "sum_121",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "div_72",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_121",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_629",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1226",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_629",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "sub_60",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_60",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_342",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_630",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1225",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "mul_631",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_631",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "sum_122",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_630",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_1679",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_122",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "convert_element_type_1680",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1221",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1679",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "add_300",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1680",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "dtype_cast_472",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_472",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.12.attention_norm",
-      "name": "alias_default_1537",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "alias_default_1228",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1228",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_337",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "einsum_default_507",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_338",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "permute_999",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1228",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_999",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "einsum_default_508",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_507",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "permute_1000",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1000",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "dtype_cast_473",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_473",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "alias_default_1526",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_508",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w2",
-      "name": "alias_default_1229",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1229",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_632",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1229",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_633",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_632",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_1230",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1230",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_330",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "einsum_default_509",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_335",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "permute_1003",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1230",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1003",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "einsum_default_510",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_509",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "permute_1004",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1004",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "dtype_cast_474",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_474",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w3",
-      "name": "alias_default_1527",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_633",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "convert_element_type_1689",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_332",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "convert_element_type_1690",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1690",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_1231",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1231",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "neg_52",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "exp_52",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "add_301",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_301",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "reciprocal_20",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_20",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_634",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_634",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_1232",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1689",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1232",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_635",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1232",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "sub_61",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1231",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_636",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_636",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "add_302",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_635",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_302",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "mul_637",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_637",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "convert_element_type_1691",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1691",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward",
-      "name": "alias_default_1233",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1233",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_330",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "einsum_default_511",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_331",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "permute_1007",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1233",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1007",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "einsum_default_512",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_510",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_512",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11",
-      "name": "add_303",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_511",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "permute_1008",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1008",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "dtype_cast_475",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_475",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.feed_forward.w1",
-      "name": "alias_default_1525",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_303",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_1696",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_1697",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_327",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_1698",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1696",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_1234",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1698",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_638",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1697",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_329",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_639",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_638",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_1235",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_639",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_1236",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1235",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_640",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_640",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "sum_123",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "div_73",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_73",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_123",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_641",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1235",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_641",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "sub_62",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_329",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_642",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1234",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "mul_643",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_643",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "sum_124",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_642",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_1699",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_124",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "convert_element_type_1700",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1228",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1699",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "add_304",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1700",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "dtype_cast_476",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_476",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.ffn_norm",
-      "name": "alias_default_1529",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_304",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "alias_default_1237",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_324",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "einsum_default_513",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_325",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "permute_1011",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1011",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "einsum_default_514",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_513",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "permute_1012",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1012",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "dtype_cast_477",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_477",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wo",
-      "name": "alias_default_1524",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_514",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1292",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1292",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_1013",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1013",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_320",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_100",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_105",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_106",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_20",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_348",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_349",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_20",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.sdpa",
-      "name": "getitem_350",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_350",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_1014",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_1015",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "permute_1016",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1014",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1293",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1293",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "sum_125",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_125",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "squeeze_40",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1015",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1294",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1294",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "sum_126",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "squeeze_41",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_41",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_1705",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1016",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_1706",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1705",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1295",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1295",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_complex_104",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_319",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "_conj_40",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_40",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "clone_230",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_230",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "mul_644",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1706",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1296",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1296",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_complex_105",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_319",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "_conj_41",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_41",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "clone_231",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_231",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "mul_645",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_644",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_real_104",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_104",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1297",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1297",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_1707",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_645",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_as_real_105",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_105",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1298",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1298",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "convert_element_type_1708",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_40",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1299",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1707",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1300",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1708",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "view_1301",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1299",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_1238",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1238",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_315",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "einsum_default_515",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_318",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "permute_1019",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1238",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1019",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "einsum_default_516",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_515",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "permute_1020",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1020",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "dtype_cast_478",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_478",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wv",
-      "name": "alias_default_1523",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1300",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_1239",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1239",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_315",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "einsum_default_517",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_317",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "permute_1023",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1239",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1023",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "einsum_default_518",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_516",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_518",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "add_305",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_517",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "permute_1024",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1024",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "dtype_cast_479",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_479",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wk",
-      "name": "alias_default_1522",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1301",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention",
-      "name": "alias_default_1240",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1240",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_315",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "einsum_default_519",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_316",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "permute_1027",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1240",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1027",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "einsum_default_520",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_305",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_520",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11",
-      "name": "add_306",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_519",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "permute_1028",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1028",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "dtype_cast_480",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_480",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention.wq",
-      "name": "alias_default_1521",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_306",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_1721",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_1722",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_312",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_1723",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1721",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_1241",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1723",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_646",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1722",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_647",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_646",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_1242",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_647",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_1243",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_648",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_648",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "sum_127",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "div_74",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_649",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_649",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "sub_63",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_63",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_650",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1241",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1243",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "mul_651",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_651",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "sum_128",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_650",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_1724",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_128",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "convert_element_type_1725",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1724",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "add_307",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1725",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "dtype_cast_481",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_481",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.11.attention_norm",
-      "name": "alias_default_1528",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_307",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "alias_default_1244",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1244",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_309",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "einsum_default_521",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_310",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "permute_1031",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1244",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1031",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "einsum_default_522",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_521",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "permute_1032",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1032",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "dtype_cast_482",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_482",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "alias_default_1517",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_522",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w2",
-      "name": "alias_default_1245",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1245",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_306",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_652",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1245",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_308",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_653",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_652",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_1246",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1246",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_302",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "einsum_default_523",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_307",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "permute_1035",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1246",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1035",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "einsum_default_524",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_523",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "permute_1036",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1036",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "dtype_cast_483",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_483",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w3",
-      "name": "alias_default_1518",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_653",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "convert_element_type_1734",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_304",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "convert_element_type_1735",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1735",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_1247",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1247",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "neg_53",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "exp_53",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "add_308",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_308",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "reciprocal_21",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_21",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_654",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_654",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_1248",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1734",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_655",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "sub_64",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1247",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_64",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_656",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_656",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "add_309",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_655",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_309",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "mul_657",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_657",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "convert_element_type_1736",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1736",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward",
-      "name": "alias_default_1249",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1249",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_302",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "einsum_default_525",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_303",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "permute_1039",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1249",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1039",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "einsum_default_526",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_524",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_526",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10",
-      "name": "add_310",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_525",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "permute_1040",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1040",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "dtype_cast_484",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_484",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.feed_forward.w1",
-      "name": "alias_default_1516",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_310",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_1741",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_1742",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_299",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_1743",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1741",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_1250",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1250",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1743",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_658",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1742",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_659",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_658",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_1251",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_659",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_1252",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1252",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1251",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_660",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_660",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "sum_129",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1252",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "div_75",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_75",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_129",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_661",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1251",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_661",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "sub_65",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_65",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_662",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1250",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1252",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "mul_663",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_663",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "sum_130",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_662",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_1744",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_130",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "convert_element_type_1745",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1244",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1744",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "add_311",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1745",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "dtype_cast_485",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_485",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.ffn_norm",
-      "name": "alias_default_1520",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_311",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "alias_default_1253",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1253",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_296",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "einsum_default_527",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_297",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "permute_1043",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1253",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1043",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "einsum_default_528",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_527",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "permute_1044",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1044",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "dtype_cast_486",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_486",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wo",
-      "name": "alias_default_1515",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_528",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1316",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1316",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_1045",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1045",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_292",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_293",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_294",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_295",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_96",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_97",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_21",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_351",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_352",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.sdpa",
-      "name": "getitem_353",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_353",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_1046",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_352",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_1047",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_351",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "permute_1048",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1046",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1317",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1317",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "sum_131",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_131",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "squeeze_42",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1047",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1318",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1318",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "sum_132",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_132",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "squeeze_43",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_43",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_1750",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1048",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_1751",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1750",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1319",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1319",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_complex_106",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_291",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "_conj_42",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_42",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "clone_238",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_238",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "mul_664",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1751",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1320",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1320",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_complex_107",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_291",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "_conj_43",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_43",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "clone_239",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_239",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "mul_665",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_664",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_real_106",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_106",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1321",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1321",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_1752",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_665",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_as_real_107",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_107",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1322",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1322",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "convert_element_type_1753",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_42",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1323",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1752",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1324",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1753",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "view_1325",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1323",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_1254",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1254",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_287",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "einsum_default_529",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_290",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "permute_1051",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1254",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1051",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "einsum_default_530",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_529",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "permute_1052",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1052",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "dtype_cast_487",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_487",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wv",
-      "name": "alias_default_1514",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1324",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_1255",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1255",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_287",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "einsum_default_531",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_289",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "permute_1055",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1255",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1055",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "einsum_default_532",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_530",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_532",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "add_312",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_531",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "permute_1056",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1056",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "dtype_cast_488",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_488",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wk",
-      "name": "alias_default_1513",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1325",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention",
-      "name": "alias_default_1256",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1256",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_287",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "einsum_default_533",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_288",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "permute_1059",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1256",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1059",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "einsum_default_534",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_312",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_534",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10",
-      "name": "add_313",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_533",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "permute_1060",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1060",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "dtype_cast_489",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_489",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention.wq",
-      "name": "alias_default_1512",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_313",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_1766",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_1767",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_284",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_1768",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1766",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_1257",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1257",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1768",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_666",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1767",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_286",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_667",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_666",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_1258",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_667",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_1259",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1259",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_668",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_668",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "sum_133",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1259",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "div_76",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_76",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_133",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_669",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_669",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "sub_66",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_66",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_286",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_670",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1257",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1259",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "mul_671",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_671",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "sum_134",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_670",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_1769",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_134",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "convert_element_type_1770",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1253",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1769",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "add_314",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1770",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "dtype_cast_490",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_490",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.10.attention_norm",
-      "name": "alias_default_1519",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "alias_default_1260",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1260",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "einsum_default_535",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_282",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "permute_1063",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1260",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1063",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "einsum_default_536",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_535",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "permute_1064",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1064",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "dtype_cast_491",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_491",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "alias_default_1508",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_536",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w2",
-      "name": "alias_default_1261",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1261",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_278",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_672",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1261",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_280",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_673",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_672",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_1262",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1262",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_274",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "einsum_default_537",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_279",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "permute_1067",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1262",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1067",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "einsum_default_538",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_537",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "permute_1068",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1068",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "dtype_cast_492",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_492",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w3",
-      "name": "alias_default_1509",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_673",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "convert_element_type_1779",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_276",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "convert_element_type_1780",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1780",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_1263",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1263",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "neg_54",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "exp_54",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "add_315",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_315",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "reciprocal_22",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_22",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_674",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_674",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_1264",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1779",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1264",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_675",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1264",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "sub_67",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1263",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_67",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_676",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_676",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "add_316",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_675",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_316",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "mul_677",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_677",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "convert_element_type_1781",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1781",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward",
-      "name": "alias_default_1265",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1265",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_274",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "einsum_default_539",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_275",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "permute_1071",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1265",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1071",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "einsum_default_540",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_538",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_540",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9",
-      "name": "add_317",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_539",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "permute_1072",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1072",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "dtype_cast_493",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_493",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.feed_forward.w1",
-      "name": "alias_default_1507",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_317",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_1786",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_270",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_1787",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_271",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_1788",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1786",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_1266",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1788",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_678",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1787",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_273",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_679",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_678",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_1267",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_679",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_1268",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1267",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_680",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_680",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "sum_135",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "div_77",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_135",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_681",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1267",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_681",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "sub_68",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_68",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_273",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_682",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "mul_683",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_683",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "sum_136",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_682",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_1789",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_136",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "convert_element_type_1790",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1260",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1789",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "add_318",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1790",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "dtype_cast_494",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_494",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.ffn_norm",
-      "name": "alias_default_1511",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_318",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "alias_default_1269",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_268",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "einsum_default_541",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_269",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "permute_1075",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1075",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "einsum_default_542",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_541",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "permute_1076",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1076",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "dtype_cast_495",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_495",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wo",
-      "name": "alias_default_1506",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_542",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1340",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1340",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_1077",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1077",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_264",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_265",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_266",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_267",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_82",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_87",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_88",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_22",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_354",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_355",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_22",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.sdpa",
-      "name": "getitem_356",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_356",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_1078",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_355",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_1079",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "permute_1080",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1078",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1341",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1341",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "sum_137",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_137",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "squeeze_44",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1079",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1342",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1342",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "sum_138",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "squeeze_45",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_45",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_1795",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1080",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_1796",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1795",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1343",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1343",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_complex_108",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_263",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "_conj_44",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_44",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "clone_246",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_246",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "mul_684",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1796",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1344",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1344",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_complex_109",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_263",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "_conj_45",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_45",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "clone_247",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_247",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "mul_685",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_684",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_real_108",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1345",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1345",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_1797",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_685",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_as_real_109",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_109",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1346",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1346",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "convert_element_type_1798",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_44",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1347",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1797",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1348",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1798",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "view_1349",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1347",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_1270",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1270",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_259",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "einsum_default_543",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_262",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "permute_1083",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1270",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1083",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "einsum_default_544",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_543",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "permute_1084",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1084",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "dtype_cast_496",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_496",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wv",
-      "name": "alias_default_1505",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1348",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_1271",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1271",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_259",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "einsum_default_545",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_261",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "permute_1087",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1271",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1087",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "einsum_default_546",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_544",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_546",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "add_319",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_545",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "permute_1088",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1088",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "dtype_cast_497",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_497",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wk",
-      "name": "alias_default_1504",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1349",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention",
-      "name": "alias_default_1272",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1272",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_259",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "einsum_default_547",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_260",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "permute_1091",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1272",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1091",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "einsum_default_548",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_319",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_548",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9",
-      "name": "add_320",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_547",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "permute_1092",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1092",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "dtype_cast_498",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_498",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention.wq",
-      "name": "alias_default_1503",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_320",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_1811",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_255",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_1812",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_256",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_1813",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1811",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_1273",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1273",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1813",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_686",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1812",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_687",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_686",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_1274",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_687",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_1275",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1275",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1274",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_688",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_688",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "sum_139",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1275",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "div_78",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_78",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_139",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_689",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1274",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_689",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "sub_69",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_258",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_690",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1273",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1275",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "mul_691",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_691",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "sum_140",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_690",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_1814",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_140",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "convert_element_type_1815",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1814",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "add_321",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1815",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "dtype_cast_499",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_499",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.9.attention_norm",
-      "name": "alias_default_1510",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "alias_default_1276",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1276",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_253",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "einsum_default_549",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_254",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "permute_1095",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1276",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1095",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "einsum_default_550",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_549",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "permute_1096",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1096",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "dtype_cast_500",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_500",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "alias_default_1499",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_550",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w2",
-      "name": "alias_default_1277",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1277",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_250",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_692",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1277",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_252",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_693",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_692",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_1278",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1278",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_246",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "einsum_default_551",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_251",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "permute_1099",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1278",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1099",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "einsum_default_552",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_551",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "permute_1100",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1100",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "dtype_cast_501",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_501",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w3",
-      "name": "alias_default_1500",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_693",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "convert_element_type_1824",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_248",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "convert_element_type_1825",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1825",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_1279",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1279",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "neg_55",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "exp_55",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "add_322",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_322",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "reciprocal_23",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_23",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_694",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_694",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_1280",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1824",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1280",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_695",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1280",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "sub_70",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1279",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_70",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_696",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_696",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "add_323",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_695",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_323",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "mul_697",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_697",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "convert_element_type_1826",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1826",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward",
-      "name": "alias_default_1281",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_246",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "einsum_default_553",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_247",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "permute_1103",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1281",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1103",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "einsum_default_554",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_552",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_554",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8",
-      "name": "add_324",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_553",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "permute_1104",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1104",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "dtype_cast_502",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_502",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.feed_forward.w1",
-      "name": "alias_default_1498",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_324",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_1831",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_242",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_1832",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_243",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_1833",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1831",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_1282",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1282",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1833",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_698",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1832",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_245",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_699",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_698",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_1283",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_699",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_1284",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1284",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_700",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_700",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "sum_141",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1284",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "div_79",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_79",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_701",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1283",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_701",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "sub_71",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_245",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_702",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1282",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1284",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "mul_703",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_703",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "sum_142",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_702",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_1834",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_142",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "convert_element_type_1835",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1276",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1834",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "add_325",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1835",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "dtype_cast_503",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_503",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.ffn_norm",
-      "name": "alias_default_1502",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_325",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "alias_default_1285",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1285",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_240",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "einsum_default_555",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_241",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "permute_1107",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1285",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1107",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "einsum_default_556",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_555",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "permute_1108",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1108",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "dtype_cast_504",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_504",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wo",
-      "name": "alias_default_1497",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_556",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1364",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1364",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_1109",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1109",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_236",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_238",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_239",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_73",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_78",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_79",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_23",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_357",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_358",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_23",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.sdpa",
-      "name": "getitem_359",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_359",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_1110",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_358",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_1111",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_357",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "permute_1112",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1365",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1365",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "sum_143",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_143",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "squeeze_46",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1366",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1366",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "sum_144",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_144",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "squeeze_47",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_47",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_1840",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_1841",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1840",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1367",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1367",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_complex_110",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_235",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "_conj_46",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_46",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "clone_254",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_254",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "mul_704",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1841",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1368",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1368",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_complex_111",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_235",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "_conj_47",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_47",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "clone_255",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_255",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "mul_705",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_704",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_real_110",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1369",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1369",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_1842",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_705",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_as_real_111",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1370",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1370",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "convert_element_type_1843",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_46",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1371",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1842",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1372",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1843",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "view_1373",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1371",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_1286",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1286",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_231",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "einsum_default_557",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_234",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "permute_1115",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1286",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1115",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "einsum_default_558",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_557",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "permute_1116",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1116",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "dtype_cast_505",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_505",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wv",
-      "name": "alias_default_1496",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1372",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_1287",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1287",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_231",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "einsum_default_559",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_233",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "permute_1119",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1287",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1119",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "einsum_default_560",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_558",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_560",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "add_326",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_559",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "permute_1120",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1120",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "dtype_cast_506",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_506",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wk",
-      "name": "alias_default_1495",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1373",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention",
-      "name": "alias_default_1288",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1288",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_231",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "einsum_default_561",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_232",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "permute_1123",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1288",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1123",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "einsum_default_562",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_326",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_562",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8",
-      "name": "add_327",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_561",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "permute_1124",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1124",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "dtype_cast_507",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_507",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention.wq",
-      "name": "alias_default_1494",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_327",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_1856",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_227",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_1857",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_228",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_1858",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1856",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_1289",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1289",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1858",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_706",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1857",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_230",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_707",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_706",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_1290",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_707",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_1291",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_708",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_708",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "sum_145",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "div_80",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_80",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_145",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_709",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1290",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_709",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "sub_72",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_230",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_710",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1289",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1291",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "mul_711",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_711",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "sum_146",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_710",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_1859",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_146",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "convert_element_type_1860",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1285",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1859",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "add_328",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1860",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "dtype_cast_508",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_508",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.8.attention_norm",
-      "name": "alias_default_1501",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_328",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "alias_default_1292",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1292",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_225",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "einsum_default_563",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_226",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "permute_1127",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1292",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1127",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "einsum_default_564",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_563",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "permute_1128",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1128",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "dtype_cast_509",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_509",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "alias_default_1490",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_564",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w2",
-      "name": "alias_default_1293",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1293",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_222",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_712",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1293",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_224",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_713",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_712",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_1294",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1294",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_218",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "einsum_default_565",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_223",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "permute_1131",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1294",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1131",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "einsum_default_566",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_565",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "permute_1132",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1132",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "dtype_cast_510",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_510",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w3",
-      "name": "alias_default_1491",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_713",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "convert_element_type_1869",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_220",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "convert_element_type_1870",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1870",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_1295",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1295",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "neg_56",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "exp_56",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "add_329",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_329",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "reciprocal_24",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_714",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_714",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_1296",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1869",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1296",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_715",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1296",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "sub_73",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1295",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_73",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_716",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_716",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "add_330",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_715",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_330",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "mul_717",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_717",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "convert_element_type_1871",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1871",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward",
-      "name": "alias_default_1297",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1297",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_218",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "einsum_default_567",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_219",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "permute_1135",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1297",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1135",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "einsum_default_568",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_566",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_568",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7",
-      "name": "add_331",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_567",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "permute_1136",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1136",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "dtype_cast_511",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_511",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.feed_forward.w1",
-      "name": "alias_default_1489",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_331",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_1876",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_214",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_1877",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_215",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_1878",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1876",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_1298",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1878",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_718",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1877",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_719",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_718",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_1299",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_719",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_1300",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1299",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_720",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_720",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "sum_147",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "div_81",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_721",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1299",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_721",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "sub_74",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_722",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1298",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1300",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "mul_723",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_723",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "sum_148",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_722",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_1879",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_148",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "convert_element_type_1880",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1292",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1879",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "add_332",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1880",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "dtype_cast_512",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_512",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.ffn_norm",
-      "name": "alias_default_1493",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_332",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "alias_default_1301",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_212",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "einsum_default_569",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_213",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "permute_1139",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1139",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "einsum_default_570",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_569",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "permute_1140",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1140",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "dtype_cast_513",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_513",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wo",
-      "name": "alias_default_1488",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_570",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1388",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1388",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_1141",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1141",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_208",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_209",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_210",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_211",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_64",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_69",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_70",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_24",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_360",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_361",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_24",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.sdpa",
-      "name": "getitem_362",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_1142",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_361",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_1143",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_360",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "permute_1144",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1142",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1389",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1389",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "sum_149",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_149",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "squeeze_48",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1143",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1390",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "sum_150",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_150",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "squeeze_49",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_49",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_1885",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1144",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_1886",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1885",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1391",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1391",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_complex_112",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_207",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "_conj_48",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_48",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "clone_262",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_262",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "mul_724",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1886",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1392",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_complex_113",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_207",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "_conj_49",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_49",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "clone_263",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_263",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "mul_725",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_724",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_real_112",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1393",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_1887",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_725",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_as_real_113",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1394",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1394",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "convert_element_type_1888",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_48",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1395",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1887",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1396",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1888",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "view_1397",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1395",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_1302",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1302",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_203",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "einsum_default_571",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_206",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "permute_1147",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1302",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1147",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "einsum_default_572",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_571",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "permute_1148",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1148",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "dtype_cast_514",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_514",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wv",
-      "name": "alias_default_1487",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1396",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_1303",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1303",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_203",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "einsum_default_573",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_205",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "permute_1151",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1303",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "einsum_default_574",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_572",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_574",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "add_333",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_573",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "permute_1152",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1152",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "dtype_cast_515",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_515",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wk",
-      "name": "alias_default_1486",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1397",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention",
-      "name": "alias_default_1304",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1304",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_203",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "einsum_default_575",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_204",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "permute_1155",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1304",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1155",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "einsum_default_576",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_576",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7",
-      "name": "add_334",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_575",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "permute_1156",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1156",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "dtype_cast_516",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_516",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention.wq",
-      "name": "alias_default_1485",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_334",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_1901",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_199",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_1902",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_200",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_1903",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1901",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_1305",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1305",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1903",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_726",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1902",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_727",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_726",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_1306",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_727",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_1307",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1307",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1306",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_728",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_728",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "sum_151",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1307",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "div_82",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_82",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_151",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_729",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1306",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_729",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "sub_75",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_75",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_202",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_730",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1305",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1307",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "mul_731",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_731",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "sum_152",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_730",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_1904",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_152",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "convert_element_type_1905",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1904",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "add_335",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1905",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "dtype_cast_517",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_517",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.7.attention_norm",
-      "name": "alias_default_1492",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_335",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "alias_default_1308",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1308",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_197",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "einsum_default_577",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_198",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "permute_1159",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1308",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1159",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "einsum_default_578",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_577",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "permute_1160",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1160",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "dtype_cast_518",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_518",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "alias_default_1481",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_578",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w2",
-      "name": "alias_default_1309",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1309",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_194",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_732",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1309",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_196",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_733",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_732",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_1310",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1310",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_190",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "einsum_default_579",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_195",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "permute_1163",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1310",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1163",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "einsum_default_580",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_579",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "permute_1164",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1164",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "dtype_cast_519",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_519",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w3",
-      "name": "alias_default_1482",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_733",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "convert_element_type_1914",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_192",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "convert_element_type_1915",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1915",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_1311",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1311",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "neg_57",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "exp_57",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "add_336",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "reciprocal_25",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_25",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_734",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_734",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_1312",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1914",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1312",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_735",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1312",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "sub_76",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1311",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_76",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_736",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_736",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "add_337",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_735",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_337",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "mul_737",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_737",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "convert_element_type_1916",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1916",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward",
-      "name": "alias_default_1313",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1313",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_190",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "einsum_default_581",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_191",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "permute_1167",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1313",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1167",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "einsum_default_582",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_580",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_582",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6",
-      "name": "add_338",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_581",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "permute_1168",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1168",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "dtype_cast_520",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_520",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.feed_forward.w1",
-      "name": "alias_default_1480",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_338",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_1921",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_186",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_1922",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_187",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_1923",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1921",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_1314",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1923",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_738",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1922",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_739",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_738",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_1315",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_739",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_1316",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1316",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1315",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_740",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_740",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "sum_153",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1316",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "div_83",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_83",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_153",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_741",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1315",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_741",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "sub_77",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_742",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1314",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1316",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "mul_743",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_743",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "sum_154",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_742",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_1924",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_154",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "convert_element_type_1925",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1308",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1924",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "add_339",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1925",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "dtype_cast_521",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_521",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.ffn_norm",
-      "name": "alias_default_1484",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "alias_default_1317",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1317",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_184",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "einsum_default_583",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_185",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "permute_1171",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1317",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1171",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "einsum_default_584",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_583",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "permute_1172",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1172",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "dtype_cast_522",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_522",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wo",
-      "name": "alias_default_1479",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_584",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1412",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1412",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_1173",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1173",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_180",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_182",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_183",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_55",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_60",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_61",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_25",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_363",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_364",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_25",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.sdpa",
-      "name": "getitem_365",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_365",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_1174",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_364",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_1175",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_363",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "permute_1176",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1174",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1413",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1413",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "sum_155",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_155",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "squeeze_50",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1175",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1414",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1414",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "sum_156",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_156",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "squeeze_51",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_51",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_1930",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1176",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_1931",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1930",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1415",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1415",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_complex_114",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_179",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "_conj_50",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_50",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "clone_270",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_270",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "mul_744",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1931",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1416",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_complex_115",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_179",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "_conj_51",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_51",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "clone_271",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_115",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_271",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "mul_745",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_744",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_real_114",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_114",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1417",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1417",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_1932",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_745",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_as_real_115",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_115",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1418",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1418",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "convert_element_type_1933",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_50",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1419",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1932",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1420",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1933",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "view_1421",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1419",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_1318",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1318",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_175",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "einsum_default_585",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_178",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "permute_1179",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1318",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1179",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "einsum_default_586",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_585",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "permute_1180",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1180",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "dtype_cast_523",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_523",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wv",
-      "name": "alias_default_1478",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1420",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_1319",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1319",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_175",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "einsum_default_587",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_177",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "permute_1183",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1319",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1183",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "einsum_default_588",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_586",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_588",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "add_340",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_587",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "permute_1184",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1184",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "dtype_cast_524",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_524",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wk",
-      "name": "alias_default_1477",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1421",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention",
-      "name": "alias_default_1320",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1320",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_175",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "einsum_default_589",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_176",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "permute_1187",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1320",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1187",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "einsum_default_590",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_340",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_590",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6",
-      "name": "add_341",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_589",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "permute_1188",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1188",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "dtype_cast_525",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_525",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention.wq",
-      "name": "alias_default_1476",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_341",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_1946",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_1947",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_172",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_1948",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1946",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_1321",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1948",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_746",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1947",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_174",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_747",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_746",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_1322",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_747",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_1323",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_748",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_748",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "sum_157",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "div_84",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_84",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_157",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_749",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1322",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_749",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "sub_78",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_78",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_174",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_750",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1321",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1323",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "mul_751",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_751",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "sum_158",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_750",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_1949",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_158",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "convert_element_type_1950",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1317",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1949",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "add_342",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1950",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "dtype_cast_526",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_526",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.6.attention_norm",
-      "name": "alias_default_1483",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_342",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "alias_default_1324",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1324",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_169",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "einsum_default_591",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_170",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "permute_1191",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1324",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1191",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "einsum_default_592",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_591",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "permute_1192",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1192",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "dtype_cast_527",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_527",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "alias_default_1472",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_592",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w2",
-      "name": "alias_default_1325",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1325",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_166",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_752",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1325",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_753",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_752",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_1326",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1326",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_162",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "einsum_default_593",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_167",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "permute_1195",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1326",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1195",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "einsum_default_594",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_593",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "permute_1196",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1196",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "dtype_cast_528",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_528",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w3",
-      "name": "alias_default_1473",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_753",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "convert_element_type_1959",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_164",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "convert_element_type_1960",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1960",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_1327",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1327",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "neg_58",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "exp_58",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "add_343",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_343",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "reciprocal_26",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_754",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_754",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_1328",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1959",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1328",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_755",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1328",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "sub_79",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1327",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_79",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_756",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_756",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "add_344",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_755",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_344",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "mul_757",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_757",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "convert_element_type_1961",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1961",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward",
-      "name": "alias_default_1329",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1329",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_162",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "einsum_default_595",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_163",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "permute_1199",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1329",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1199",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "einsum_default_596",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_594",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_596",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5",
-      "name": "add_345",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_595",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "permute_1200",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1200",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "dtype_cast_529",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_529",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.feed_forward.w1",
-      "name": "alias_default_1471",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_345",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_1966",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_158",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_1967",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_159",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_1968",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1966",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_1330",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1330",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1968",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_758",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1967",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_161",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_759",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_758",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_1331",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_759",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_1332",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1332",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1331",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_760",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_760",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "sum_159",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1332",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "div_85",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_85",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_159",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_761",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1331",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_761",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "sub_80",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_80",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_161",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_762",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1330",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1332",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "mul_763",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_763",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "sum_160",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_762",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_1969",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_160",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "convert_element_type_1970",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1324",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1969",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "add_346",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1970",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "dtype_cast_530",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_530",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.ffn_norm",
-      "name": "alias_default_1475",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "alias_default_1333",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_156",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "einsum_default_597",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_157",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "permute_1203",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1203",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "einsum_default_598",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_597",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "permute_1204",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1204",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "dtype_cast_531",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_531",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wo",
-      "name": "alias_default_1470",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_598",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1436",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1436",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_1205",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1205",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_152",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_153",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_154",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_155",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_51",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_52",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_26",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_366",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_367",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_26",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.sdpa",
-      "name": "getitem_368",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_368",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_1206",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_367",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_1207",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_366",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "permute_1208",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1206",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1437",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1437",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "sum_161",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_161",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "squeeze_52",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1207",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1438",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1438",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "sum_162",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_162",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "squeeze_53",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_53",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_1975",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1208",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_1976",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1975",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1439",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1439",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_complex_116",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "_conj_52",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_52",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "clone_278",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_116",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_278",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "mul_764",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1976",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1440",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1440",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_complex_117",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_151",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "_conj_53",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_53",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "clone_279",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_117",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_279",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "mul_765",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_764",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_real_116",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_116",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1441",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1441",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_1977",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_765",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_as_real_117",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_117",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1442",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1442",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "convert_element_type_1978",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1443",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1977",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1444",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_1978",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "view_1445",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1443",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_1334",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_147",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "einsum_default_599",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_150",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "permute_1211",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1211",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "einsum_default_600",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_599",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "permute_1212",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1212",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "dtype_cast_532",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_532",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wv",
-      "name": "alias_default_1469",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1444",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_1335",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1335",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_147",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "einsum_default_601",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_149",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "permute_1215",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1335",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1215",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "einsum_default_602",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_600",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_602",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "add_347",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_601",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "permute_1216",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1216",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "dtype_cast_533",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_533",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wk",
-      "name": "alias_default_1468",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1445",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention",
-      "name": "alias_default_1336",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_147",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "einsum_default_603",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_148",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "permute_1219",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1219",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "einsum_default_604",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_347",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_604",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5",
-      "name": "add_348",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_603",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "permute_1220",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1220",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "dtype_cast_534",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_534",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention.wq",
-      "name": "alias_default_1467",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_1991",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_143",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_1992",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_144",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_1993",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1991",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_1337",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1337",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_1993",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_766",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1992",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_767",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_766",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_1338",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_767",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_1339",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1338",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_768",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_768",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "sum_163",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "div_86",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_86",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_163",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_769",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1338",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_769",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "sub_81",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_81",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_770",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1337",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1339",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "mul_771",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_771",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "sum_164",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_770",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_1994",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_164",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "convert_element_type_1995",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_1994",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "add_349",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_1995",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "dtype_cast_535",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_535",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.5.attention_norm",
-      "name": "alias_default_1474",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "alias_default_1340",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1340",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_141",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "einsum_default_605",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_142",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "permute_1223",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1340",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1223",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "einsum_default_606",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_605",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "permute_1224",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1224",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "dtype_cast_536",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_536",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "alias_default_1463",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_606",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w2",
-      "name": "alias_default_1341",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1341",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_138",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_772",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1341",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_140",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_773",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_772",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_1342",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1342",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_134",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "einsum_default_607",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_139",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "permute_1227",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1342",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1227",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "einsum_default_608",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_607",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "permute_1228",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1228",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "dtype_cast_537",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_537",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w3",
-      "name": "alias_default_1464",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_773",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "convert_element_type_2004",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_136",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "convert_element_type_2005",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2005",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_1343",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1343",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "neg_59",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "exp_59",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "add_350",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_350",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "reciprocal_27",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_27",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_774",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_774",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_1344",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2004",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1344",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_775",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1344",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "sub_82",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1343",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_776",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_776",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "add_351",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_775",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_351",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "mul_777",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_777",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "convert_element_type_2006",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2006",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward",
-      "name": "alias_default_1345",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1345",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_134",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "einsum_default_609",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_135",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "permute_1231",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1345",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1231",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "einsum_default_610",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_608",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_610",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4",
-      "name": "add_352",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_609",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "permute_1232",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1232",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "dtype_cast_538",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_538",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.feed_forward.w1",
-      "name": "alias_default_1462",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_352",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_2011",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_130",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_2012",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_131",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_2013",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2011",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_1346",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2013",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_778",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2012",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_133",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_779",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_778",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_1347",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_779",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_1348",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1347",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_780",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_780",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "sum_165",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "div_87",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_165",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_781",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1347",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_781",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "sub_83",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_83",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_133",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_782",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1346",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1348",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "mul_783",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_783",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "sum_166",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_782",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_2014",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_166",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "convert_element_type_2015",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1340",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2014",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "add_353",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2015",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "dtype_cast_539",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_539",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.ffn_norm",
-      "name": "alias_default_1466",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_353",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "alias_default_1349",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_128",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "einsum_default_611",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_129",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "permute_1235",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1235",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "einsum_default_612",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_611",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "permute_1236",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1236",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "dtype_cast_540",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_540",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wo",
-      "name": "alias_default_1461",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_612",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1460",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1460",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_1237",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1237",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_124",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_125",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_37",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_42",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_43",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_27",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_369",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_370",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_27",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.sdpa",
-      "name": "getitem_371",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_371",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_1238",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_1239",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "permute_1240",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1238",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1461",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1461",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "sum_167",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_167",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "squeeze_54",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1239",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1462",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1462",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "sum_168",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_168",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "squeeze_55",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_55",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_2020",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1240",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_2021",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2020",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1463",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1463",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_complex_118",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_123",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "_conj_54",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_54",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "clone_286",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_118",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_286",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "mul_784",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2021",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1464",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1464",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_complex_119",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_123",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "_conj_55",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_55",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "clone_287",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_119",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_287",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "mul_785",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_784",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_real_118",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_118",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1465",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1465",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_2022",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_785",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_as_real_119",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_119",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1466",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1466",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "convert_element_type_2023",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1467",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2022",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1468",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2023",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "view_1469",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1467",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_1350",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1350",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_119",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "einsum_default_613",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_122",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "permute_1243",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1350",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1243",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "einsum_default_614",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_613",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "permute_1244",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1244",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "dtype_cast_541",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_541",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wv",
-      "name": "alias_default_1460",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1468",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_1351",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1351",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_119",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "einsum_default_615",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_121",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "permute_1247",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1351",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1247",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "einsum_default_616",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_614",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_616",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "add_354",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_615",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "permute_1248",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1248",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "dtype_cast_542",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_542",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wk",
-      "name": "alias_default_1459",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1469",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention",
-      "name": "alias_default_1352",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1352",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_119",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "einsum_default_617",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_120",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "permute_1251",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1352",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1251",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "einsum_default_618",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_618",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4",
-      "name": "add_355",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_617",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "permute_1252",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1252",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "dtype_cast_543",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_543",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention.wq",
-      "name": "alias_default_1458",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_355",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_2036",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_115",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_2037",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_116",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_2038",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2036",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_1353",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1353",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2038",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_786",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2037",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_118",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_787",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_786",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_1354",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_787",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_1355",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1355",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_788",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_788",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "sum_169",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1355",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "div_88",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_88",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_169",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_789",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1354",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_789",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "sub_84",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_84",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_118",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_790",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1353",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1355",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "mul_791",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_791",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "sum_170",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_790",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_2039",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_170",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "convert_element_type_2040",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1349",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2039",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "add_356",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2040",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "dtype_cast_544",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_544",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.4.attention_norm",
-      "name": "alias_default_1465",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_356",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "alias_default_1356",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1356",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "einsum_default_619",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_114",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "permute_1255",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1356",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1255",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "einsum_default_620",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_619",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "permute_1256",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1256",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "dtype_cast_545",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_545",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "alias_default_1454",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_620",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w2",
-      "name": "alias_default_1357",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1357",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_792",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1357",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_793",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_792",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_1358",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1358",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_106",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "einsum_default_621",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_111",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "permute_1259",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1358",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1259",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "einsum_default_622",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_621",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "permute_1260",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1260",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "dtype_cast_546",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_546",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w3",
-      "name": "alias_default_1455",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_793",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "convert_element_type_2049",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_108",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "convert_element_type_2050",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2050",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_1359",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1359",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "neg_60",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "exp_60",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "add_357",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_357",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "reciprocal_28",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_794",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_794",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_1360",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2049",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1360",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_795",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1360",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "sub_85",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1359",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_796",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_796",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "add_358",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_795",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_358",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "mul_797",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_797",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "convert_element_type_2051",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2051",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward",
-      "name": "alias_default_1361",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1361",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_106",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "einsum_default_623",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_107",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "permute_1263",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1361",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1263",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "einsum_default_624",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_622",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_624",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3",
-      "name": "add_359",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_623",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "permute_1264",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1264",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "dtype_cast_547",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_547",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.feed_forward.w1",
-      "name": "alias_default_1453",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_359",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_2056",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_2057",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_103",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_2058",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2056",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_1362",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2058",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_798",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2057",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_105",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_799",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_798",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_1363",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_799",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_1364",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1364",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1363",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_800",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_800",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "sum_171",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1364",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "div_89",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_89",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_801",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1363",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_801",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "sub_86",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_86",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_105",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_802",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1364",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "mul_803",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_803",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "sum_172",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_802",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_2059",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_172",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "convert_element_type_2060",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1356",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2059",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "add_360",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2060",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "dtype_cast_548",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_548",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.ffn_norm",
-      "name": "alias_default_1457",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_360",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "alias_default_1365",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1365",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_100",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "einsum_default_625",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_101",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "permute_1267",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1365",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1267",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "einsum_default_626",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_625",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "permute_1268",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1268",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "dtype_cast_549",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_549",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wo",
-      "name": "alias_default_1452",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_626",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1484",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1484",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_1269",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1269",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_97",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_98",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_99",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_33",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_34",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_28",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_372",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_373",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_28",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.sdpa",
-      "name": "getitem_374",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_374",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_1270",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_373",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_1271",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_372",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "permute_1272",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1270",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1485",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1485",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "sum_173",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_173",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "squeeze_56",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1271",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1486",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1486",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "sum_174",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_174",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "squeeze_57",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_2065",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1272",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_2066",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2065",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1487",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1487",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_complex_120",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_95",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "_conj_56",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_56",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "clone_294",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_294",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "mul_804",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2066",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1488",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1488",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_complex_121",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_95",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "_conj_57",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_57",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "clone_295",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_295",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "mul_805",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_804",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_real_120",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_120",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1489",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1489",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_2067",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_805",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_as_real_121",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_121",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1490",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1490",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "convert_element_type_2068",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1491",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2067",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1492",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2068",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "view_1493",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1491",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_1366",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1366",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_91",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "einsum_default_627",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_94",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "permute_1275",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1366",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1275",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "einsum_default_628",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_627",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "permute_1276",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1276",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "dtype_cast_550",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_550",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wv",
-      "name": "alias_default_1451",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1492",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_1367",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1367",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_91",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "einsum_default_629",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_93",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "permute_1279",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1367",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1279",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "einsum_default_630",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_628",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_630",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "add_361",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_629",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "permute_1280",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1280",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "dtype_cast_551",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_551",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wk",
-      "name": "alias_default_1450",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1493",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention",
-      "name": "alias_default_1368",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1368",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_91",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "einsum_default_631",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_92",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "permute_1283",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1368",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1283",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "einsum_default_632",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_361",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_632",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3",
-      "name": "add_362",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_631",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "permute_1284",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1284",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "dtype_cast_552",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_552",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention.wq",
-      "name": "alias_default_1449",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_362",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_2081",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_2082",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_88",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_2083",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2081",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_1369",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2083",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_806",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2082",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_807",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_806",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_1370",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_807",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_1371",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1371",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_808",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_808",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "sum_175",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1371",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "div_90",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_175",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_809",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_809",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "sub_87",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_87",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_810",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1371",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "mul_811",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_811",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "sum_176",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_810",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_2084",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_176",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "convert_element_type_2085",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1365",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2084",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "add_363",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2085",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "dtype_cast_553",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_553",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.3.attention_norm",
-      "name": "alias_default_1456",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_363",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "alias_default_1372",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1372",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_85",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "einsum_default_633",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "cluster_root": "permute_1319",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_86",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "permute_1287",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "cluster_root": "einsum_default_648",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1372",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1287",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "einsum_default_634",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_633",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "permute_1288",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1288",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "dtype_cast_554",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_554",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "alias_default_1445",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "cluster_root": "alias_default_1389",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_634",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w2",
-      "name": "alias_default_1373",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "cluster_root": "mul_832",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1373",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_82",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_812",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "cluster_root": "mul_833",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1373",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_84",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_813",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "cluster_root": "alias_default_1390",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_812",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_1374",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1374",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_78",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "einsum_default_635",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "cluster_root": "permute_1323",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_83",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "permute_1291",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "cluster_root": "einsum_default_650",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1374",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1291",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "einsum_default_636",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_635",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "permute_1292",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1292",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "dtype_cast_555",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_555",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w3",
-      "name": "alias_default_1446",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "cluster_root": "convert_element_type_2139",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_813",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "convert_element_type_2094",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "cluster_root": "convert_element_type_2140",
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_80",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "convert_element_type_2095",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "cluster_root": "alias_default_1391",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2095",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_1375",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "cluster_root": "neg_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1375",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "neg_61",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "cluster_root": "exp_62",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "exp_61",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "cluster_root": "add_371",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "add_364",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "cluster_root": "reciprocal_30",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_364",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "reciprocal_29",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "cluster_root": "mul_834",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_814",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "cluster_root": "alias_default_1392",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_814",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_1376",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "cluster_root": "mul_835",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2094",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1376",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_815",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "cluster_root": "sub_91",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1376",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "sub_88",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "cluster_root": "mul_836",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1375",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_88",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_816",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "cluster_root": "add_372",
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_816",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "add_365",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "cluster_root": "mul_837",
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_815",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_365",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "mul_817",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "cluster_root": "convert_element_type_2141",
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_817",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "convert_element_type_2096",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "cluster_root": "alias_default_1393",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2096",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward",
-      "name": "alias_default_1377",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1377",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_78",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "einsum_default_637",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "cluster_root": "permute_1327",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_79",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "permute_1295",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "cluster_root": "einsum_default_652",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1377",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1295",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "einsum_default_638",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_636",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_638",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2",
-      "name": "add_366",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_637",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "permute_1296",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1296",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "dtype_cast_556",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_556",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.feed_forward.w1",
-      "name": "alias_default_1444",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "cluster_root": "convert_element_type_2146",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_366",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_2101",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "cluster_root": "convert_element_type_2147",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_74",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_2102",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "cluster_root": "convert_element_type_2148",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_75",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_2103",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "cluster_root": "alias_default_1394",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2101",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_1378",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "cluster_root": "mul_838",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2103",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_818",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "cluster_root": "mul_839",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2102",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_819",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "cluster_root": "alias_default_1395",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_818",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_1379",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "cluster_root": "alias_default_1396",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_819",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_1380",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "cluster_root": "mul_840",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1379",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_820",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "cluster_root": "sum_183",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_820",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "sum_177",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "cluster_root": "div_93",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "div_91",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "cluster_root": "mul_841",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_91",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_177",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_821",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "cluster_root": "sub_92",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1379",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_821",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "sub_89",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "cluster_root": "mul_842",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_89",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_77",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_822",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "mul_823",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_823",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "sum_178",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "cluster_root": "convert_element_type_2149",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_822",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_2104",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_178",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "convert_element_type_2105",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1372",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2104",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "add_367",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2105",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "dtype_cast_557",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_557",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.ffn_norm",
-      "name": "alias_default_1448",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_367",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "alias_default_1381",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_72",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "einsum_default_639",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "cluster_root": "permute_1331",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_73",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "permute_1299",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "cluster_root": "einsum_default_654",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1299",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "einsum_default_640",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_639",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "permute_1300",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1300",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "dtype_cast_558",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_558",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wo",
-      "name": "alias_default_1443",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "cluster_root": "view_1532",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_640",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1508",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "cluster_root": "permute_1333",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1508",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_1301",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "cluster_root": "_scaled_dot_product_flash_attention_backward_30",
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1301",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_68",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_69",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_70",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_71",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_19",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_24",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_25",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_29",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "cluster_root": "getitem_378",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_375",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "cluster_root": "getitem_379",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_376",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "cluster_root": "getitem_380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_29",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.sdpa",
-      "name": "getitem_377",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "cluster_root": "permute_1334",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_377",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_1302",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "cluster_root": "permute_1335",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_376",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_1303",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "cluster_root": "permute_1336",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_375",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "permute_1304",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "cluster_root": "view_1533",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1302",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1509",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "cluster_root": "sum_185",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1509",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "sum_179",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "cluster_root": "squeeze_60",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_179",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "squeeze_58",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "cluster_root": "view_1534",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1303",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1510",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "cluster_root": "sum_186",
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1510",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "sum_180",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "cluster_root": "squeeze_61",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_180",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "squeeze_59",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "cluster_root": "convert_element_type_2155",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_59",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_2110",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "cluster_root": "convert_element_type_2156",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1304",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_2111",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "cluster_root": "view_1535",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2110",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1511",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "cluster_root": "view_as_complex_124",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1511",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_complex_122",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "cluster_root": "_conj_60",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_67",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "_conj_58",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "cluster_root": "clone_310",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_58",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "clone_302",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "cluster_root": "mul_844",
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_122",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_302",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "mul_824",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "cluster_root": "view_1536",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2111",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1512",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "cluster_root": "view_as_complex_125",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1512",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_complex_123",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "cluster_root": "_conj_61",
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_67",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "_conj_59",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "cluster_root": "clone_311",
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_59",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "clone_303",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "cluster_root": "mul_845",
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_123",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_303",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "mul_825",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "cluster_root": "view_as_real_124",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_824",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_real_122",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "cluster_root": "view_1537",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_122",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1513",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "cluster_root": "convert_element_type_2157",
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1513",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_2112",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "cluster_root": "view_as_real_125",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_825",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_as_real_123",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "cluster_root": "view_1538",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_123",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1514",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "cluster_root": "convert_element_type_2158",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1514",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "convert_element_type_2113",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "cluster_root": "view_1539",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_58",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1515",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "cluster_root": "view_1540",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2112",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1516",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "cluster_root": "view_1541",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2113",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "view_1517",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "cluster_root": "alias_default_1398",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1515",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_1382",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1382",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_63",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "einsum_default_641",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "cluster_root": "permute_1339",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_66",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "permute_1307",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "cluster_root": "einsum_default_656",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1382",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1307",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "einsum_default_642",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_641",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "permute_1308",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1308",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "dtype_cast_559",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_559",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wv",
-      "name": "alias_default_1442",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "cluster_root": "alias_default_1399",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1516",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_1383",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1383",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_63",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "einsum_default_643",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "cluster_root": "permute_1343",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_65",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "permute_1311",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "cluster_root": "einsum_default_658",
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1383",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1311",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "einsum_default_644",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_642",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_644",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "add_368",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_643",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "permute_1312",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1312",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "dtype_cast_560",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_560",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wk",
-      "name": "alias_default_1441",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "cluster_root": "alias_default_1400",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1517",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention",
-      "name": "alias_default_1384",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1384",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_63",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "einsum_default_645",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "cluster_root": "permute_1347",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_64",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "permute_1315",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "cluster_root": "einsum_default_660",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1384",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1315",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "einsum_default_646",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_368",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_646",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2",
-      "name": "add_369",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_645",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "permute_1316",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1316",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "dtype_cast_561",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_561",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention.wq",
-      "name": "alias_default_1440",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "cluster_root": "convert_element_type_2171",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_369",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_2126",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "cluster_root": "convert_element_type_2172",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_59",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_2127",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "cluster_root": "convert_element_type_2173",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_60",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_2128",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "cluster_root": "alias_default_1401",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2126",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_1385",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "cluster_root": "mul_846",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1385",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2128",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_826",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "cluster_root": "mul_847",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2127",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_827",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "cluster_root": "alias_default_1402",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_826",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_1386",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "cluster_root": "alias_default_1403",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_827",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_1387",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "cluster_root": "mul_848",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1387",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1386",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_828",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "cluster_root": "sum_187",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_828",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "sum_181",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "cluster_root": "div_94",
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1387",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "div_92",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "cluster_root": "mul_849",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_92",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_181",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_829",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "cluster_root": "sub_93",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1386",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_829",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "sub_90",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "cluster_root": "mul_850",
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_90",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_62",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_830",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1385",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1387",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "mul_831",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_831",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "sum_182",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "cluster_root": "convert_element_type_2174",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_830",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_2129",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_182",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "convert_element_type_2130",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2129",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "add_370",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2130",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "dtype_cast_562",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_562",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.2.attention_norm",
-      "name": "alias_default_1447",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_370",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "alias_default_1388",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "cluster_root": "einsum_default_661",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1388",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_57",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "einsum_default_647",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 113,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_58",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "permute_1319",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 114,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1388",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1319",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "einsum_default_648",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "cluster_root": "permute_1352",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_647",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "permute_1320",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "cluster_root": "dtype_cast_572",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1320",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "dtype_cast_563",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "cluster_root": "alias_default_1427",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_563",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "alias_default_1436",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 115,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_648",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w2",
-      "name": "alias_default_1389",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 116,
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1389",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_54",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_832",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 117,
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1389",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_56",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_833",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 118,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_832",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_1390",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "cluster_root": "einsum_default_663",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_50",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "einsum_default_649",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 119,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_55",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "permute_1323",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 120,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1390",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1323",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "einsum_default_650",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "cluster_root": "permute_1356",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_649",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "permute_1324",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "cluster_root": "dtype_cast_573",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1324",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "dtype_cast_564",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "cluster_root": "alias_default_1428",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_564",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w3",
-      "name": "alias_default_1437",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 121,
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_833",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "convert_element_type_2139",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 122,
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_52",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "convert_element_type_2140",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 123,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2140",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_1391",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 124,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1391",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "neg_62",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 125,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "exp_62",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 126,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "add_371",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 127,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_371",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "reciprocal_30",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 128,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_30",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_834",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 129,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_834",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_1392",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 130,
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2139",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_835",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 131,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1392",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "sub_91",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 132,
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1391",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_91",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_836",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 133,
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_836",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "add_372",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 134,
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_835",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_372",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "mul_837",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 135,
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_837",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "convert_element_type_2141",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 136,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2141",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward",
-      "name": "alias_default_1393",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "cluster_root": "einsum_default_665",
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_50",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "einsum_default_651",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 137,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_51",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "permute_1327",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 138,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1393",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1327",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "einsum_default_652",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 139,
-      "cluster_root": "add_163",
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_650",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_652",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1",
-      "name": "add_373",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "cluster_root": "permute_1360",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_651",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "permute_1328",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "cluster_root": "dtype_cast_574",
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1328",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "dtype_cast_565",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "cluster_root": "alias_default_1426",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_565",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.feed_forward.w1",
-      "name": "alias_default_1435",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 140,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_373",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_2146",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 141,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_46",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_2147",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 142,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_47",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_2148",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 143,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2146",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_1394",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 144,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1394",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2148",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_838",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 145,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2147",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_839",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 146,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_838",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_1395",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 147,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_839",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_1396",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 148,
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1396",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1395",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_840",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 149,
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_840",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "sum_183",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 150,
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1396",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "div_93",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 151,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_93",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_183",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_841",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 152,
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1395",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_841",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "sub_92",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 153,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_92",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_49",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_842",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "cluster_root": "mul_863",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1394",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1396",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "mul_843",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "cluster_root": "sum_190",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_843",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "sum_184",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 154,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_842",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_2149",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "cluster_root": "convert_element_type_2195",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_184",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "convert_element_type_2150",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 155,
-      "cluster_root": "add_164",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1388",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2149",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "add_374",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "cluster_root": "dtype_cast_575",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2150",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "dtype_cast_566",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "cluster_root": "alias_default_1430",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_566",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.ffn_norm",
-      "name": "alias_default_1439",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 156,
-      "cluster_root": "alias_default_917",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_374",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "alias_default_1397",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "cluster_root": "einsum_default_667",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1397",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_44",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "einsum_default_653",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 157,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_45",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "permute_1331",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 158,
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1397",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1331",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "einsum_default_654",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "cluster_root": "permute_1364",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_653",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "permute_1332",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "cluster_root": "dtype_cast_576",
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1332",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "dtype_cast_567",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "cluster_root": "alias_default_1425",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_567",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wo",
-      "name": "alias_default_1434",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 159,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_654",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1532",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 160,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1532",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_1333",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 161,
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1333",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_40",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_41",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_42",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_43",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_10",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_15",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_16",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_30",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 162,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_378",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 163,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_379",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 164,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_30",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.sdpa",
-      "name": "getitem_380",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 165,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_380",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_1334",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 166,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_379",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_1335",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 167,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_378",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "permute_1336",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 168,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1334",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1533",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 169,
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1533",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "sum_185",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 170,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_185",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "squeeze_60",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 171,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1335",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1534",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 172,
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1534",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "sum_186",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 173,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_186",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "squeeze_61",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 174,
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_61",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_2155",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 175,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1336",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_2156",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 176,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2155",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1535",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 177,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1535",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_complex_124",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 178,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_39",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "_conj_60",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 179,
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_60",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "clone_310",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 180,
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_124",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_310",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "mul_844",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 181,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2156",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1536",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 182,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1536",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_complex_125",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 183,
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_39",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "_conj_61",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 184,
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_61",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "clone_311",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 185,
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_125",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_311",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "mul_845",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 186,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_844",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_real_124",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 187,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_124",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1537",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 188,
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1537",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_2157",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 189,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_845",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_as_real_125",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 190,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_125",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1538",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 191,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1538",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "convert_element_type_2158",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 192,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_60",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1539",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 193,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2157",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1540",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 194,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2158",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "view_1541",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 195,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1539",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_1398",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "cluster_root": "einsum_default_669",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1398",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_35",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "einsum_default_655",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 196,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_38",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "permute_1339",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 197,
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1398",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1339",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "einsum_default_656",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "cluster_root": "permute_1372",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_655",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "permute_1340",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "cluster_root": "dtype_cast_577",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1340",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "dtype_cast_568",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "cluster_root": "alias_default_1424",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_568",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wv",
-      "name": "alias_default_1433",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 198,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1540",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_1399",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "cluster_root": "einsum_default_671",
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1399",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_35",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "einsum_default_657",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 199,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_37",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "permute_1343",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 200,
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1399",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1343",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "einsum_default_658",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 201,
-      "cluster_root": "add_165",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_656",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_658",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "add_375",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "cluster_root": "permute_1376",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_657",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "permute_1344",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "cluster_root": "dtype_cast_578",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1344",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "dtype_cast_569",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "cluster_root": "alias_default_1423",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_569",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wk",
-      "name": "alias_default_1432",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 202,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1541",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention",
-      "name": "alias_default_1400",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "cluster_root": "einsum_default_673",
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1400",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_35",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "einsum_default_659",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 203,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_36",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "permute_1347",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 204,
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1400",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1347",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "einsum_default_660",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 205,
-      "cluster_root": "add_166",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_375",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_660",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1",
-      "name": "add_376",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "cluster_root": "permute_1380",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_659",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "permute_1348",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "cluster_root": "dtype_cast_579",
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1348",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "dtype_cast_570",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "cluster_root": "alias_default_1422",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_570",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention.wq",
-      "name": "alias_default_1431",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 206,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_376",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_2171",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 207,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_2172",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 208,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_32",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_2173",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 209,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2171",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_1401",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 210,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1401",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2173",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_846",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 211,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2172",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_847",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 212,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_846",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_1402",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 213,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_847",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_1403",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 214,
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1403",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1402",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_848",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 215,
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_848",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "sum_187",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 216,
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1403",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "div_94",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 217,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_94",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_187",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_849",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 218,
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1402",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_849",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "sub_93",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 219,
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_93",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_34",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_850",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "cluster_root": "mul_871",
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1401",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1403",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "mul_851",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "cluster_root": "sum_194",
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_851",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "sum_188",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 220,
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_850",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_2174",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "cluster_root": "convert_element_type_2220",
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_188",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "convert_element_type_2175",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 221,
-      "cluster_root": "add_167",
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1397",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2174",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "add_377",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "cluster_root": "dtype_cast_580",
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2175",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "dtype_cast_571",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "cluster_root": "alias_default_1429",
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_571",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.1.attention_norm",
-      "name": "alias_default_1438",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 222,
-      "cluster_root": "alias_default_924",
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)R",
-          "name": "add_377",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "alias_default_1404",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)R",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 109,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1404",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_29",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "einsum_default_661",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "alias_default_30",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "permute_1351",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_1404",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "permute_1351",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "einsum_default_662",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 110,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "einsum_default_661",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "permute_1352",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 111,
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "permute_1352",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "dtype_cast_572",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 112,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(1)",
-          "name": "dtype_cast_572",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "alias_default_1427",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_662",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w2",
-      "name": "alias_default_1405",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1405",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_26",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_852",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1405",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_28",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_853",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_852",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_1406",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 223,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1406",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_22",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "einsum_default_663",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_27",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "permute_1355",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1406",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1355",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "einsum_default_664",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 224,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_663",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "permute_1356",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 225,
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1356",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "dtype_cast_573",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 226,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_573",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w3",
-      "name": "alias_default_1428",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_853",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "convert_element_type_2184",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 136.64587220149252,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_24",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "convert_element_type_2185",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2185",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_1407",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1407",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "neg_63",
-      "op": "aten.neg.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "neg_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "exp_63",
-      "op": "aten.exp.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "exp_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "add_378",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_378",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "reciprocal_31",
-      "op": "aten.reciprocal.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "reciprocal_31",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_854",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_854",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_1408",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2184",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1408",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_855",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1408",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "sub_94",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1407",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sub_94",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_856",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 182.1944962686567,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_856",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "add_379",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 273.29174440298505,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_855",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "add_379",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "mul_857",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 136.64587220149252,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_857",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "convert_element_type_2186",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2186",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward",
-      "name": "alias_default_1409",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 227,
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1409",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_22",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "einsum_default_665",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        14336
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_23",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "permute_1359",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 694.8379851971689,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1409",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(0)",
-          "name": "permute_1359",
-          "src_placement": "RS(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "einsum_default_666",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 156.16671108742005,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_664",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)P(sum)",
-          "name": "einsum_default_666",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0",
-      "name": "add_380",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)P(sum)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 228,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_665",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "permute_1360",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return self.w2(F.silu(self.w1(x)) * self.w3(x))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 355
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 229,
-      "compute_cost": 34.16146805037313,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1360",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "dtype_cast_574",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 230,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 487.952,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_574",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.feed_forward.w1",
-      "name": "alias_default_1426",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 430.3685785129651,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_380",
-          "src_placement": "S(0)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_2191",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_18",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_2192",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_19",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_2193",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2191",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_1410",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1410",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2193",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_858",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2192",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_859",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_858",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_1411",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_859",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_1412",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1412",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1411",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_860",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_860",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "sum_189",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1412",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "div_95",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_95",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_189",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_861",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1411",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_861",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "sub_95",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_95",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_21",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_862",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 231,
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1410",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1412",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "mul_863",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 232,
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_863",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "sum_190",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_862",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_2194",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 233,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_190",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "convert_element_type_2195",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1404",
-          "src_placement": "S(0)R",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2194",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "add_381",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "cluster_id": 234,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2195",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "dtype_cast_575",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 235,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_575",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.ffn_norm",
-      "name": "alias_default_1430",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "alias_default_1413",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 236,
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1413",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_16",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "einsum_default_667",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_17",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "permute_1363",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1413",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1363",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "einsum_default_668",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 237,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "einsum_default_667",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "permute_1364",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return self.wo(output)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 316
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 238,
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "permute_1364",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "dtype_cast_576",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 239,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 331.9007188940092,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_576",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wo",
-      "name": "alias_default_1425",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(2)",
-          "name": "einsum_default_668",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1556",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "output = output.view(bs, seqlen, -1)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 315
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1556",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_1365",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "output = output.transpose(",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 312
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 1985.2513862776257,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "permute_1365",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_12",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_13",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_14",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_15",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_1",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_6",
-          "src_placement": "RR",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "getitem_7",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "_scaled_dot_product_flash_attention_backward_31",
-      "op": "aten._scaled_dot_product_flash_attention_backward.default",
-      "phase": "backward",
-      "placement": "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem_381",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem_382",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "_scaled_dot_product_flash_attention_backward_31",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.sdpa",
-      "name": "getitem_383",
-      "op": "<built-in function getitem>",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        32,
-        8192,
-        128
-      ],
-      "source": {
-        "code": "return F.scaled_dot_product_attention(q, k, v, is_causal=True)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 53
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_383",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_1366",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 308
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_1367",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 307
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "getitem_381",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "permute_1368",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 306
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1366",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1557",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1557",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "sum_191",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_191",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "squeeze_62",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1367",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1558",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        4,
-        128
-      ],
-      "source": {
-        "code": ".reshape(bs, slen, n_kv_heads * n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 223
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 16.26736573827292,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1558",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "sum_192",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        1,
-        128
-      ],
-      "source": {
-        "code": ".expand(bs, slen, n_kv_heads, n_rep, head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 222
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "sum_192",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "squeeze_63",
-      "op": "aten.squeeze.dim",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "torch.unsqueeze(x, dim=3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "repeat_kv",
-        "line": 221
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_63",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_2200",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "permute_1368",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_2201",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "return xq_out.type_as(xq), xk_out.type_as(xk)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 212
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2200",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1559",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1559",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_complex_126",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "_conj_62",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_62",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "clone_318",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 14.64062916444563,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_318",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "mul_864",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64
-      ],
-      "source": {
-        "code": "xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 211
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2201",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1560",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1560",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_complex_127",
-      "op": "aten.view_as_complex.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_11",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "_conj_63",
-      "op": "aten._conj.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "_conj_63",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "clone_319",
-      "op": "aten.clone.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1,
-        8192,
-        1,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 53.68230693630064,
-      "dtype": "complex64",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_complex_127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "clone_319",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "mul_865",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64
-      ],
-      "source": {
-        "code": "xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 210
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_864",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_real_126",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_126",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1561",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 9.760419442963753,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1561",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_2202",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        8,
-        128
-      ],
-      "source": {
-        "code": "xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 208
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "mul_865",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_as_real_127",
-      "op": "aten.view_as_real.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        64,
-        2
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_as_real_127",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1562",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1562",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "convert_element_type_2203",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        32,
-        128
-      ],
-      "source": {
-        "code": "xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "apply_rotary_emb",
-        "line": 207
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "squeeze_62",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1563",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2202",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1564",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "convert_element_type_2203",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "view_1565",
-      "op": "aten.view.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1563",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_1414",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xv = xv.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 297
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 240,
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1414",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_7",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "einsum_default_669",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_10",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "permute_1371",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1414",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 36.328589861751155,
-          "dst_placement": "RR",
-          "name": "permute_1371",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "einsum_default_670",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "cluster_id": 241,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_669",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "permute_1372",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 242,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1372",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "dtype_cast_577",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 243,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_577",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wv",
-      "name": "alias_default_1424",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1564",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_1415",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        1024
-      ],
-      "source": {
-        "code": "xk = xk.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 296
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 244,
-      "compute_cost": 56.12241179704158,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1415",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_7",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "einsum_default_671",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        1024
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_9",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "permute_1375",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 49.631284656940636,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 86.07528421052632,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1415",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "permute_1375",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "einsum_default_672",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_670",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_672",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "add_382",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 245,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_671",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "permute_1376",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 246,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1376",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "dtype_cast_578",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 247,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 57.40529711375213,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_578",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wk",
-      "name": "alias_default_1423",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "view_1565",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention",
-      "name": "alias_default_1416",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(2)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq = xq.view(bs, seqlen, -1, self.head_dim)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 295
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 248,
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(2)",
-          "name": "alias_default_1416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)R",
-          "name": "alias_default_7",
-          "src_placement": "S(0)R",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "einsum_default_673",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "P(sum)S(1)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RS(1)",
-          "name": "alias_default_8",
-          "src_placement": "RS(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "permute_1379",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "RS(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 198.52513862776254,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 190.35670720457864,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1416",
-          "src_placement": "S(0)S(2)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 94.3143594470046,
-          "dst_placement": "RR",
-          "name": "permute_1379",
-          "src_placement": "RS(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "einsum_default_674",
-      "op": "aten.einsum.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 2.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_382",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "einsum_default_674",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0",
-      "name": "add_383",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 249,
-      "compute_cost": 0.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(1)",
-          "name": "einsum_default_673",
-          "src_placement": "P(sum)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "permute_1380",
-      "op": "aten.permute.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 290
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 250,
-      "compute_cost": 9.760419442963753,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)S(0)",
-          "name": "permute_1380",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "dtype_cast_579",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 251,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 160.272,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_579",
-          "src_placement": "P(sum)S(0)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention.wq",
-      "name": "alias_default_1422",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "add_383",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type_2216",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_3",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type_2217",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_4",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type_2218",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "RR",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2216",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_1417",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1417",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "convert_element_type_2218",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_866",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2217",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_867",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_866",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_1418",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_867",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_1419",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1419",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1418",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_868",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_868",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "sum_193",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        1
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.05557036247335,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1419",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "div_96",
-      "op": "aten.div.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "div_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sum_193",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_869",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1418",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_869",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "sub_96",
-      "op": "aten.sub.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 52.06192480221486,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "sub_96",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_6",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_870",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 252,
-      "compute_cost": 78.08335554371003,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1417",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1419",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "mul_871",
-      "op": "aten.mul.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 253,
-      "compute_cost": 26.034139620978188,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_871",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "sum_194",
-      "op": "aten.sum.dim_IntList",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "mul_870",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type_2219",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 254,
-      "compute_cost": 7.0,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "sum_194",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "convert_element_type_2220",
-      "op": "prims.convert_element_type.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 39.041677771855014,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "alias_default_1413",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(0)S(1)",
-          "name": "convert_element_type_2219",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "name": "add_384",
-      "op": "aten.add.Tensor",
-      "phase": "backward",
-      "placement": "S(0)S(1)",
-      "shape": [
-        8,
-        8192,
-        4096
-      ],
-      "source": {
-        "code": "return torch.rms_norm(input, normalized_shape, weight, eps)",
-        "file": "/data/users/wangkj/pytorch/torch/nn/functional.py",
-        "func": "rms_norm",
-        "line": 2964
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 255,
-      "compute_cost": 7.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "P(sum)P(sum)",
-          "name": "convert_element_type_2220",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "dtype_cast_580",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "P(sum)P(sum)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "cluster_id": 256,
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 28.41652038284296,
-          "dst_placement": "S(0)S(0)",
-          "name": "dtype_cast_580",
-          "src_placement": "P(sum)P(sum)",
-          "transition_cost": 1
-        }
-      ],
-      "module_path": "L['self'].layers.0.attention_norm",
-      "name": "alias_default_1429",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 38.685829146330285,
-      "dtype": "bfloat16",
-      "inputs": [
-        {
-          "comm_cost": 706.2108351658422,
-          "dst_placement": "S(2)S(2)",
-          "name": "add_384",
-          "src_placement": "S(0)S(1)",
-          "transition_cost": 1
-        },
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "RR",
-          "name": "alias_default_1",
-          "src_placement": "RR",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "embedding_dense_backward",
-      "op": "aten.embedding_dense_backward.default",
-      "phase": "backward",
-      "placement": "S(1)S(1)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/_testing/models/llama3.py",
-        "func": "forward",
-        "line": 539
-      },
-      "transition_cost": 1.0
-    },
-    {
-      "compute_cost": 76.40578345195063,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(1)S(1)",
-          "name": "embedding_dense_backward",
-          "src_placement": "S(1)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "dtype_cast_581",
-      "op": "autoparallel.dtype_cast.default",
-      "phase": "backward",
-      "placement": "S(1)S(1)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "compute_cost": 0.0,
-      "dtype": "float32",
-      "inputs": [
-        {
-          "comm_cost": 0.0,
-          "dst_placement": "S(1)S(1)",
-          "name": "dtype_cast_581",
-          "src_placement": "S(1)S(1)",
-          "transition_cost": 0
-        }
-      ],
-      "module_path": "L['self'].tok_embeddings",
-      "name": "alias_default_1421",
-      "op": "aten.alias.default",
-      "phase": "backward",
-      "placement": "S(1)S(1)",
-      "shape": [
-        128256,
-        4096
-      ],
-      "source": {
-        "code": "return torch.ops.autoparallel.dtype_cast(_param, _dtype)",
-        "file": "/home/wangkj/workspace/autoparallel/autoparallel/cast_parametrization.py",
-        "func": "getter",
-        "line": 25
-      },
-      "transition_cost": 0.0
-    },
-    {
-      "inputs": [
-        {
-          "name": "alias_default_1420"
-        },
-        {
-          "name": "alias_default_1421"
-        },
-        {
-          "name": "alias_default_1422"
-        },
-        {
-          "name": "alias_default_1423"
-        },
-        {
-          "name": "alias_default_1424"
-        },
-        {
-          "name": "alias_default_1425"
-        },
-        {
-          "name": "alias_default_1426"
-        },
-        {
-          "name": "alias_default_1427"
-        },
-        {
-          "name": "alias_default_1428"
-        },
-        {
-          "name": "alias_default_1429"
-        },
-        {
-          "name": "alias_default_1430"
-        },
-        {
-          "name": "alias_default_1431"
-        },
-        {
-          "name": "alias_default_1432"
-        },
-        {
-          "name": "alias_default_1433"
-        },
-        {
-          "name": "alias_default_1434"
-        },
-        {
-          "name": "alias_default_1435"
-        },
-        {
-          "name": "alias_default_1436"
-        },
-        {
-          "name": "alias_default_1437"
-        },
-        {
-          "name": "alias_default_1438"
-        },
-        {
-          "name": "alias_default_1439"
-        },
-        {
-          "name": "alias_default_1440"
-        },
-        {
-          "name": "alias_default_1441"
-        },
-        {
-          "name": "alias_default_1442"
-        },
-        {
-          "name": "alias_default_1443"
-        },
-        {
-          "name": "alias_default_1444"
-        },
-        {
-          "name": "alias_default_1445"
-        },
-        {
-          "name": "alias_default_1446"
-        },
-        {
-          "name": "alias_default_1447"
-        },
-        {
-          "name": "alias_default_1448"
-        },
-        {
-          "name": "alias_default_1449"
-        },
-        {
-          "name": "alias_default_1450"
-        },
-        {
-          "name": "alias_default_1451"
-        },
-        {
-          "name": "alias_default_1452"
-        },
-        {
-          "name": "alias_default_1453"
-        },
-        {
-          "name": "alias_default_1454"
-        },
-        {
-          "name": "alias_default_1455"
-        },
-        {
-          "name": "alias_default_1456"
-        },
-        {
-          "name": "alias_default_1457"
-        },
-        {
-          "name": "alias_default_1458"
-        },
-        {
-          "name": "alias_default_1459"
-        },
-        {
-          "name": "alias_default_1460"
-        },
-        {
-          "name": "alias_default_1461"
-        },
-        {
-          "name": "alias_default_1462"
-        },
-        {
-          "name": "alias_default_1463"
-        },
-        {
-          "name": "alias_default_1464"
-        },
-        {
-          "name": "alias_default_1465"
-        },
-        {
-          "name": "alias_default_1466"
-        },
-        {
-          "name": "alias_default_1467"
-        },
-        {
-          "name": "alias_default_1468"
-        },
-        {
-          "name": "alias_default_1469"
-        },
-        {
-          "name": "alias_default_1470"
-        },
-        {
-          "name": "alias_default_1471"
-        },
-        {
-          "name": "alias_default_1472"
-        },
-        {
-          "name": "alias_default_1473"
-        },
-        {
-          "name": "alias_default_1474"
-        },
-        {
-          "name": "alias_default_1475"
-        },
-        {
-          "name": "alias_default_1476"
-        },
-        {
-          "name": "alias_default_1477"
-        },
-        {
-          "name": "alias_default_1478"
-        },
-        {
-          "name": "alias_default_1479"
-        },
-        {
-          "name": "alias_default_1480"
-        },
-        {
-          "name": "alias_default_1481"
-        },
-        {
-          "name": "alias_default_1482"
-        },
-        {
-          "name": "alias_default_1483"
-        },
-        {
-          "name": "alias_default_1484"
-        },
-        {
-          "name": "alias_default_1485"
-        },
-        {
-          "name": "alias_default_1486"
-        },
-        {
-          "name": "alias_default_1487"
-        },
-        {
-          "name": "alias_default_1488"
-        },
-        {
-          "name": "alias_default_1489"
-        },
-        {
-          "name": "alias_default_1490"
-        },
-        {
-          "name": "alias_default_1491"
-        },
-        {
-          "name": "alias_default_1492"
-        },
-        {
-          "name": "alias_default_1493"
-        },
-        {
-          "name": "alias_default_1494"
-        },
-        {
-          "name": "alias_default_1495"
-        },
-        {
-          "name": "alias_default_1496"
-        },
-        {
-          "name": "alias_default_1497"
-        },
-        {
-          "name": "alias_default_1498"
-        },
-        {
-          "name": "alias_default_1499"
-        },
-        {
-          "name": "alias_default_1500"
-        },
-        {
-          "name": "alias_default_1501"
-        },
-        {
-          "name": "alias_default_1502"
-        },
-        {
-          "name": "alias_default_1503"
-        },
-        {
-          "name": "alias_default_1504"
-        },
-        {
-          "name": "alias_default_1505"
-        },
-        {
-          "name": "alias_default_1506"
-        },
-        {
-          "name": "alias_default_1507"
-        },
-        {
-          "name": "alias_default_1508"
-        },
-        {
-          "name": "alias_default_1509"
-        },
-        {
-          "name": "alias_default_1510"
-        },
-        {
-          "name": "alias_default_1511"
-        },
-        {
-          "name": "alias_default_1512"
-        },
-        {
-          "name": "alias_default_1513"
-        },
-        {
-          "name": "alias_default_1514"
-        },
-        {
-          "name": "alias_default_1515"
-        },
-        {
-          "name": "alias_default_1516"
-        },
-        {
-          "name": "alias_default_1517"
-        },
-        {
-          "name": "alias_default_1518"
-        },
-        {
-          "name": "alias_default_1519"
-        },
-        {
-          "name": "alias_default_1520"
-        },
-        {
-          "name": "alias_default_1521"
-        },
-        {
-          "name": "alias_default_1522"
-        },
-        {
-          "name": "alias_default_1523"
-        },
-        {
-          "name": "alias_default_1524"
-        },
-        {
-          "name": "alias_default_1525"
-        },
-        {
-          "name": "alias_default_1526"
-        },
-        {
-          "name": "alias_default_1527"
-        },
-        {
-          "name": "alias_default_1528"
-        },
-        {
-          "name": "alias_default_1529"
-        },
-        {
-          "name": "alias_default_1530"
-        },
-        {
-          "name": "alias_default_1531"
-        },
-        {
-          "name": "alias_default_1532"
-        },
-        {
-          "name": "alias_default_1533"
-        },
-        {
-          "name": "alias_default_1534"
-        },
-        {
-          "name": "alias_default_1535"
-        },
-        {
-          "name": "alias_default_1536"
-        },
-        {
-          "name": "alias_default_1537"
-        },
-        {
-          "name": "alias_default_1538"
-        },
-        {
-          "name": "alias_default_1539"
-        },
-        {
-          "name": "alias_default_1540"
-        },
-        {
-          "name": "alias_default_1541"
-        },
-        {
-          "name": "alias_default_1542"
-        },
-        {
-          "name": "alias_default_1543"
-        },
-        {
-          "name": "alias_default_1544"
-        },
-        {
-          "name": "alias_default_1545"
-        },
-        {
-          "name": "alias_default_1546"
-        },
-        {
-          "name": "alias_default_1547"
-        },
-        {
-          "name": "alias_default_1548"
-        },
-        {
-          "name": "alias_default_1549"
-        },
-        {
-          "name": "alias_default_1550"
-        },
-        {
-          "name": "alias_default_1551"
-        },
-        {
-          "name": "alias_default_1552"
-        },
-        {
-          "name": "alias_default_1553"
-        },
-        {
-          "name": "alias_default_1554"
-        },
-        {
-          "name": "alias_default_1555"
-        },
-        {
-          "name": "alias_default_1556"
-        },
-        {
-          "name": "alias_default_1557"
-        },
-        {
-          "name": "alias_default_1558"
-        },
-        {
-          "name": "alias_default_1559"
-        },
-        {
-          "name": "alias_default_1560"
-        },
-        {
-          "name": "alias_default_1561"
-        },
-        {
-          "name": "alias_default_1562"
-        },
-        {
-          "name": "alias_default_1563"
-        },
-        {
-          "name": "alias_default_1564"
-        },
-        {
-          "name": "alias_default_1565"
-        },
-        {
-          "name": "alias_default_1566"
-        },
-        {
-          "name": "alias_default_1567"
-        },
-        {
-          "name": "alias_default_1568"
-        },
-        {
-          "name": "alias_default_1569"
-        },
-        {
-          "name": "alias_default_1570"
-        },
-        {
-          "name": "alias_default_1571"
-        },
-        {
-          "name": "alias_default_1572"
-        },
-        {
-          "name": "alias_default_1573"
-        },
-        {
-          "name": "alias_default_1574"
-        },
-        {
-          "name": "alias_default_1575"
-        },
-        {
-          "name": "alias_default_1576"
-        },
-        {
-          "name": "alias_default_1577"
-        },
-        {
-          "name": "alias_default_1578"
-        },
-        {
-          "name": "alias_default_1579"
-        },
-        {
-          "name": "alias_default_1580"
-        },
-        {
-          "name": "alias_default_1581"
-        },
-        {
-          "name": "alias_default_1582"
-        },
-        {
-          "name": "alias_default_1583"
-        },
-        {
-          "name": "alias_default_1584"
-        },
-        {
-          "name": "alias_default_1585"
-        },
-        {
-          "name": "alias_default_1586"
-        },
-        {
-          "name": "alias_default_1587"
-        },
-        {
-          "name": "alias_default_1588"
-        },
-        {
-          "name": "alias_default_1589"
-        },
-        {
-          "name": "alias_default_1590"
-        },
-        {
-          "name": "alias_default_1591"
-        },
-        {
-          "name": "alias_default_1592"
-        },
-        {
-          "name": "alias_default_1593"
-        },
-        {
-          "name": "alias_default_1594"
-        },
-        {
-          "name": "alias_default_1595"
-        },
-        {
-          "name": "alias_default_1596"
-        },
-        {
-          "name": "alias_default_1597"
-        },
-        {
-          "name": "alias_default_1598"
-        },
-        {
-          "name": "alias_default_1599"
-        },
-        {
-          "name": "alias_default_1600"
-        },
-        {
-          "name": "alias_default_1601"
-        },
-        {
-          "name": "alias_default_1602"
-        },
-        {
-          "name": "alias_default_1603"
-        },
-        {
-          "name": "alias_default_1604"
-        },
-        {
-          "name": "alias_default_1605"
-        },
-        {
-          "name": "alias_default_1606"
-        },
-        {
-          "name": "alias_default_1607"
-        },
-        {
-          "name": "alias_default_1608"
-        },
-        {
-          "name": "alias_default_1609"
-        },
-        {
-          "name": "alias_default_1610"
-        },
-        {
-          "name": "alias_default_1611"
-        },
-        {
-          "name": "alias_default_1612"
-        },
-        {
-          "name": "alias_default_1613"
-        },
-        {
-          "name": "alias_default_1614"
-        },
-        {
-          "name": "alias_default_1615"
-        },
-        {
-          "name": "alias_default_1616"
-        },
-        {
-          "name": "alias_default_1617"
-        },
-        {
-          "name": "alias_default_1618"
-        },
-        {
-          "name": "alias_default_1619"
-        },
-        {
-          "name": "alias_default_1620"
-        },
-        {
-          "name": "alias_default_1621"
-        },
-        {
-          "name": "alias_default_1622"
-        },
-        {
-          "name": "alias_default_1623"
-        },
-        {
-          "name": "alias_default_1624"
-        },
-        {
-          "name": "alias_default_1625"
-        },
-        {
-          "name": "alias_default_1626"
-        },
-        {
-          "name": "alias_default_1627"
-        },
-        {
-          "name": "alias_default_1628"
-        },
-        {
-          "name": "alias_default_1629"
-        },
-        {
-          "name": "alias_default_1630"
-        },
-        {
-          "name": "alias_default_1631"
-        },
-        {
-          "name": "alias_default_1632"
-        },
-        {
-          "name": "alias_default_1633"
-        },
-        {
-          "name": "alias_default_1634"
-        },
-        {
-          "name": "alias_default_1635"
-        },
-        {
-          "name": "alias_default_1636"
-        },
-        {
-          "name": "alias_default_1637"
-        },
-        {
-          "name": "alias_default_1638"
-        },
-        {
-          "name": "alias_default_1639"
-        },
-        {
-          "name": "alias_default_1640"
-        },
-        {
-          "name": "alias_default_1641"
-        },
-        {
-          "name": "alias_default_1642"
-        },
-        {
-          "name": "alias_default_1643"
-        },
-        {
-          "name": "alias_default_1644"
-        },
-        {
-          "name": "alias_default_1645"
-        },
-        {
-          "name": "alias_default_1646"
-        },
-        {
-          "name": "alias_default_1647"
-        },
-        {
-          "name": "alias_default_1648"
-        },
-        {
-          "name": "alias_default_1649"
-        },
-        {
-          "name": "alias_default_1650"
-        },
-        {
-          "name": "alias_default_1651"
-        },
-        {
-          "name": "alias_default_1652"
-        },
-        {
-          "name": "alias_default_1653"
-        },
-        {
-          "name": "alias_default_1654"
-        },
-        {
-          "name": "alias_default_1655"
-        },
-        {
-          "name": "alias_default_1656"
-        },
-        {
-          "name": "alias_default_1657"
-        },
-        {
-          "name": "alias_default_1658"
-        },
-        {
-          "name": "alias_default_1659"
-        },
-        {
-          "name": "alias_default_1660"
-        },
-        {
-          "name": "alias_default_1661"
-        },
-        {
-          "name": "alias_default_1662"
-        },
-        {
-          "name": "alias_default_1663"
-        },
-        {
-          "name": "alias_default_1664"
-        },
-        {
-          "name": "alias_default_1665"
-        },
-        {
-          "name": "alias_default_1666"
-        },
-        {
-          "name": "alias_default_1667"
-        },
-        {
-          "name": "alias_default_1668"
-        },
-        {
-          "name": "alias_default_1669"
-        },
-        {
-          "name": "alias_default_1670"
-        },
-        {
-          "name": "alias_default_1671"
-        },
-        {
-          "name": "alias_default_1672"
-        },
-        {
-          "name": "alias_default_1673"
-        },
-        {
-          "name": "alias_default_1674"
-        },
-        {
-          "name": "alias_default_1675"
-        },
-        {
-          "name": "alias_default_1676"
-        },
-        {
-          "name": "alias_default_1677"
-        },
-        {
-          "name": "alias_default_1678"
-        },
-        {
-          "name": "alias_default_1679"
-        },
-        {
-          "name": "alias_default_1680"
-        },
-        {
-          "name": "alias_default_1681"
-        },
-        {
-          "name": "alias_default_1682"
-        },
-        {
-          "name": "alias_default_1683"
-        },
-        {
-          "name": "alias_default_1684"
-        },
-        {
-          "name": "alias_default_1685"
-        },
-        {
-          "name": "alias_default_1686"
-        },
-        {
-          "name": "alias_default_1687"
-        },
-        {
-          "name": "alias_default_1688"
-        },
-        {
-          "name": "alias_default_1689"
-        },
-        {
-          "name": "alias_default_1690"
-        },
-        {
-          "name": "alias_default_1691"
-        },
-        {
-          "name": "alias_default_1692"
-        },
-        {
-          "name": "alias_default_1693"
-        },
-        {
-          "name": "alias_default_1694"
-        },
-        {
-          "name": "alias_default_1695"
-        },
-        {
-          "name": "alias_default_1696"
-        },
-        {
-          "name": "alias_default_1697"
-        },
-        {
-          "name": "alias_default_1698"
-        },
-        {
-          "name": "alias_default_1699"
-        },
-        {
-          "name": "alias_default_1700"
-        },
-        {
-          "name": "alias_default_1701"
-        },
-        {
-          "name": "alias_default_1702"
-        },
-        {
-          "name": "alias_default_1703"
-        },
-        {
-          "name": "alias_default_1704"
-        },
-        {
-          "name": "alias_default_1705"
-        },
-        {
-          "name": "alias_default_1706"
-        },
-        {
-          "name": "alias_default_1707"
-        },
-        {
-          "name": "alias_default_1708"
-        },
-        {
-          "name": "alias_default_1709"
-        },
-        {
-          "name": "alias_default_1710"
-        },
-        {
-          "name": "alias_default_1711"
-        }
-      ],
-      "name": "output",
-      "op": "output"
-    }
-  ],
-  "summary": {
-    "comm": 212780.17498325979,
-    "compute": 581120.8234224034,
-    "total": 794933.9984056632,
-    "transition": 1033.0
-  }
-}
\ No newline at end of file
diff --git a/profile_results/llama3_8b_4x4_strategy_summary.json b/profile_results/llama3_8b_4x4_strategy_summary.json
deleted file mode 100644
index ccdeb4d9..00000000
--- a/profile_results/llama3_8b_4x4_strategy_summary.json
+++ /dev/null
@@ -1,2054 +0,0 @@
-{
-  "config": {
-    "batch_size": 8,
-    "input_constraint": "Shard(0), Replicate()",
-    "mesh_dim_names": [
-      "dp",
-      "tp"
-    ],
-    "mesh_shape": [
-      4,
-      4
-    ],
-    "model": "autoparallel._testing.models.llama3 Transformer 8B config",
-    "output_constraint": "Shard(0), Shard(2)",
-    "seqlen": 8192,
-    "vocab_size": 128256,
-    "world_size": 16
-  },
-  "elapsed_s": 115.23945621983148,
-  "json_summary": {
-    "comm": 212780.17498325979,
-    "compute": 581120.8234224034,
-    "total": 794933.9984056632,
-    "transition": 1033.0
-  },
-  "optimizer_profile": {
-    "ilp": {
-      "cluster_copied_decision_variables": 8181840,
-      "constraints": 175408,
-      "logical_decision_variables": 8657526,
-      "unique_variables": 475686
-    },
-    "last_solve": {
-      "constraints": 175412,
-      "extract_s": 0.044429945992305875,
-      "kind": "solve",
-      "objective": 794933.998405679,
-      "objective_s": 3.8023465629667044,
-      "pipeline_total_s": 102.16174313612282,
-      "solve_s": 59.80278266593814,
-      "status": "Optimal",
-      "total_s": 63.73084603413008,
-      "unique_variables": 475686
-    },
-    "mesh": {
-      "dim_names": [
-        "dp",
-        "tp"
-      ],
-      "ndim": 2,
-      "shape": [
-        4,
-        4
-      ],
-      "size": 16
-    },
-    "model": {
-      "graph_nodes": 8668,
-      "op_counts": {
-        "call_function": 8373,
-        "output": 1,
-        "placeholder": 294
-      },
-      "parameter_bytes": 32121044992,
-      "parameter_nodes": 291,
-      "parameter_numel": 8030261248,
-      "tensor_nodes": 8667,
-      "unknown_parameter_nodes": 0
-    },
-    "strategies": {
-      "max_strategies_per_node": 81,
-      "nodes": 8668,
-      "option_tuples": 8657526,
-      "strategy_options": 220687
-    },
-    "timings": {
-      "compute_cost_estimation_s": 1.9735342266503721,
-      "constraint_construction_s": 3.2506618059705943,
-      "cost_estimation_s": 4.9254587206523865,
-      "decision_var_build_s": 15.363263476872817,
-      "decision_var_overhead_s": 6.9146421970799565,
-      "edge_cost_estimation_s": 2.9519244940020144,
-      "ilp_construction_s": 13.688466562191024,
-      "init_total_s": 38.43089710199274,
-      "pulp_var_creation_s": 3.5231625591404736,
-      "strategy_enumeration_s": 10.847158421995118,
-      "validation_s": 0.060926787089556456
-    }
-  },
-  "param_strategy_groups": {
-    "layers.*.attention.wk.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.attention.wo.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.attention.wq.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.attention.wv.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.attention_norm.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.feed_forward.w1.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.feed_forward.w2.weight": {
-      "S(0)S(1)": 32
-    },
-    "layers.*.feed_forward.w3.weight": {
-      "S(0)S(0)": 32
-    },
-    "layers.*.ffn_norm.weight": {
-      "S(0)S(0)": 32
-    },
-    "norm.weight": {
-      "S(0)S(0)": 1
-    },
-    "output.weight": {
-      "S(0)S(0)": 1
-    },
-    "tok_embeddings.weight": {
-      "S(1)S(1)": 1
-    }
-  },
-  "phase_placement_counts": {
-    "backward": [
-      [
-        "S(0)S(2)",
-        1634
-      ],
-      [
-        "S(0)S(1)",
-        1423
-      ],
-      [
-        "P(sum)S(0)",
-        354
-      ],
-      [
-        "P(sum)P(sum)",
-        291
-      ],
-      [
-        "S(0)S(0)",
-        258
-      ],
-      [
-        "RR",
-        257
-      ],
-      [
-        "P(sum)S(1)",
-        225
-      ],
-      [
-        "RS(0)",
-        129
-      ],
-      [
-        "S(0)P(sum)",
-        97
-      ],
-      [
-        "S(0)R",
-        32
-      ],
-      [
-        "RS(1)",
-        32
-      ],
-      [
-        "(S(0)S(1), S(0)S(1), S(0)S(1))",
-        32
-      ],
-      [
-        "S(1)S(1)",
-        3
-      ]
-    ],
-    "forward": [
-      [
-        "S(0)S(2)",
-        1378
-      ],
-      [
-        "S(0)S(1)",
-        1227
-      ],
-      [
-        "S(0)S(0)",
-        516
-      ],
-      [
-        "RR",
-        324
-      ],
-      [
-        "RS(1)",
-        258
-      ],
-      [
-        "S(0)R",
-        66
-      ],
-      [
-        "RS(0)",
-        64
-      ],
-      [
-        "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-        32
-      ],
-      [
-        "S(0)P(sum)",
-        32
-      ],
-      [
-        "S(1)S(1)",
-        2
-      ],
-      [
-        "S(2)S(2)",
-        1
-      ]
-    ]
-  },
-  "placement_counts": [
-    [
-      "S(0)S(2)",
-      3012
-    ],
-    [
-      "S(0)S(1)",
-      2650
-    ],
-    [
-      "S(0)S(0)",
-      774
-    ],
-    [
-      "RR",
-      581
-    ],
-    [
-      "P(sum)S(0)",
-      354
-    ],
-    [
-      "P(sum)P(sum)",
-      291
-    ],
-    [
-      "RS(1)",
-      290
-    ],
-    [
-      "P(sum)S(1)",
-      225
-    ],
-    [
-      "RS(0)",
-      193
-    ],
-    [
-      "S(0)P(sum)",
-      129
-    ],
-    [
-      "S(0)R",
-      98
-    ],
-    [
-      "(S(0)S(1), S(0)S(1), None, None, None, None, RR, RR, RR)",
-      32
-    ],
-    [
-      "(S(0)S(1), S(0)S(1), S(0)S(1))",
-      32
-    ],
-    [
-      "S(1)S(1)",
-      5
-    ],
-    [
-      "S(2)S(2)",
-      1
-    ]
-  ],
-  "sample_forward_interesting_nodes": [
-    {
-      "inputs": [],
-      "module_path": "layers.0.attention.wq.weight",
-      "name": "primals_2",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.0.attention.wk.weight",
-      "name": "primals_3",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.0.attention.wv.weight",
-      "name": "primals_4",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.0.attention.wo.weight",
-      "name": "primals_5",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.0.feed_forward.w1.weight",
-      "name": "primals_6",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.0.feed_forward.w2.weight",
-      "name": "primals_7",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.0.feed_forward.w3.weight",
-      "name": "primals_8",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.attention.wq.weight",
-      "name": "primals_11",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.attention.wk.weight",
-      "name": "primals_12",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.attention.wv.weight",
-      "name": "primals_13",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.attention.wo.weight",
-      "name": "primals_14",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.feed_forward.w1.weight",
-      "name": "primals_15",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.feed_forward.w2.weight",
-      "name": "primals_16",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.1.feed_forward.w3.weight",
-      "name": "primals_17",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.attention.wq.weight",
-      "name": "primals_20",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.attention.wk.weight",
-      "name": "primals_21",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.attention.wv.weight",
-      "name": "primals_22",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.attention.wo.weight",
-      "name": "primals_23",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.feed_forward.w1.weight",
-      "name": "primals_24",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.feed_forward.w2.weight",
-      "name": "primals_25",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.2.feed_forward.w3.weight",
-      "name": "primals_26",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.attention.wq.weight",
-      "name": "primals_29",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.attention.wk.weight",
-      "name": "primals_30",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.attention.wv.weight",
-      "name": "primals_31",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.attention.wo.weight",
-      "name": "primals_32",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.feed_forward.w1.weight",
-      "name": "primals_33",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.feed_forward.w2.weight",
-      "name": "primals_34",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.3.feed_forward.w3.weight",
-      "name": "primals_35",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.attention.wq.weight",
-      "name": "primals_38",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.attention.wk.weight",
-      "name": "primals_39",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.attention.wv.weight",
-      "name": "primals_40",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.attention.wo.weight",
-      "name": "primals_41",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.feed_forward.w1.weight",
-      "name": "primals_42",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.feed_forward.w2.weight",
-      "name": "primals_43",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.4.feed_forward.w3.weight",
-      "name": "primals_44",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.attention.wq.weight",
-      "name": "primals_47",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.attention.wk.weight",
-      "name": "primals_48",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.attention.wv.weight",
-      "name": "primals_49",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.attention.wo.weight",
-      "name": "primals_50",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.feed_forward.w1.weight",
-      "name": "primals_51",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.feed_forward.w2.weight",
-      "name": "primals_52",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.5.feed_forward.w3.weight",
-      "name": "primals_53",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.attention.wq.weight",
-      "name": "primals_56",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.attention.wk.weight",
-      "name": "primals_57",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.attention.wv.weight",
-      "name": "primals_58",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.attention.wo.weight",
-      "name": "primals_59",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.feed_forward.w1.weight",
-      "name": "primals_60",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.feed_forward.w2.weight",
-      "name": "primals_61",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.6.feed_forward.w3.weight",
-      "name": "primals_62",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.attention.wq.weight",
-      "name": "primals_65",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.attention.wk.weight",
-      "name": "primals_66",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.attention.wv.weight",
-      "name": "primals_67",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.attention.wo.weight",
-      "name": "primals_68",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.feed_forward.w1.weight",
-      "name": "primals_69",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.feed_forward.w2.weight",
-      "name": "primals_70",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.7.feed_forward.w3.weight",
-      "name": "primals_71",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.attention.wq.weight",
-      "name": "primals_74",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.attention.wk.weight",
-      "name": "primals_75",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.attention.wv.weight",
-      "name": "primals_76",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.attention.wo.weight",
-      "name": "primals_77",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.feed_forward.w1.weight",
-      "name": "primals_78",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.feed_forward.w2.weight",
-      "name": "primals_79",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.8.feed_forward.w3.weight",
-      "name": "primals_80",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.attention.wq.weight",
-      "name": "primals_83",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.attention.wk.weight",
-      "name": "primals_84",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.attention.wv.weight",
-      "name": "primals_85",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.attention.wo.weight",
-      "name": "primals_86",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.feed_forward.w1.weight",
-      "name": "primals_87",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.feed_forward.w2.weight",
-      "name": "primals_88",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.9.feed_forward.w3.weight",
-      "name": "primals_89",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.attention.wq.weight",
-      "name": "primals_92",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.attention.wk.weight",
-      "name": "primals_93",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.attention.wv.weight",
-      "name": "primals_94",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.attention.wo.weight",
-      "name": "primals_95",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.feed_forward.w1.weight",
-      "name": "primals_96",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.feed_forward.w2.weight",
-      "name": "primals_97",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.10.feed_forward.w3.weight",
-      "name": "primals_98",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.attention.wq.weight",
-      "name": "primals_101",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.attention.wk.weight",
-      "name": "primals_102",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.attention.wv.weight",
-      "name": "primals_103",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.attention.wo.weight",
-      "name": "primals_104",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.feed_forward.w1.weight",
-      "name": "primals_105",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.feed_forward.w2.weight",
-      "name": "primals_106",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.11.feed_forward.w3.weight",
-      "name": "primals_107",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.attention.wq.weight",
-      "name": "primals_110",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.attention.wk.weight",
-      "name": "primals_111",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.attention.wv.weight",
-      "name": "primals_112",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.attention.wo.weight",
-      "name": "primals_113",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.feed_forward.w1.weight",
-      "name": "primals_114",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.feed_forward.w2.weight",
-      "name": "primals_115",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.12.feed_forward.w3.weight",
-      "name": "primals_116",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.attention.wq.weight",
-      "name": "primals_119",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.attention.wk.weight",
-      "name": "primals_120",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.attention.wv.weight",
-      "name": "primals_121",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.attention.wo.weight",
-      "name": "primals_122",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.feed_forward.w1.weight",
-      "name": "primals_123",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.feed_forward.w2.weight",
-      "name": "primals_124",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.13.feed_forward.w3.weight",
-      "name": "primals_125",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.attention.wq.weight",
-      "name": "primals_128",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.attention.wk.weight",
-      "name": "primals_129",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.attention.wv.weight",
-      "name": "primals_130",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.attention.wo.weight",
-      "name": "primals_131",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.feed_forward.w1.weight",
-      "name": "primals_132",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.feed_forward.w2.weight",
-      "name": "primals_133",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.14.feed_forward.w3.weight",
-      "name": "primals_134",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.attention.wq.weight",
-      "name": "primals_137",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.attention.wk.weight",
-      "name": "primals_138",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.attention.wv.weight",
-      "name": "primals_139",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.attention.wo.weight",
-      "name": "primals_140",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.feed_forward.w1.weight",
-      "name": "primals_141",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.feed_forward.w2.weight",
-      "name": "primals_142",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.15.feed_forward.w3.weight",
-      "name": "primals_143",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.attention.wq.weight",
-      "name": "primals_146",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.attention.wk.weight",
-      "name": "primals_147",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.attention.wv.weight",
-      "name": "primals_148",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.attention.wo.weight",
-      "name": "primals_149",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.feed_forward.w1.weight",
-      "name": "primals_150",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.feed_forward.w2.weight",
-      "name": "primals_151",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.16.feed_forward.w3.weight",
-      "name": "primals_152",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.attention.wq.weight",
-      "name": "primals_155",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.attention.wk.weight",
-      "name": "primals_156",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.attention.wv.weight",
-      "name": "primals_157",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.attention.wo.weight",
-      "name": "primals_158",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.feed_forward.w1.weight",
-      "name": "primals_159",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.feed_forward.w2.weight",
-      "name": "primals_160",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.17.feed_forward.w3.weight",
-      "name": "primals_161",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.attention.wq.weight",
-      "name": "primals_164",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.attention.wk.weight",
-      "name": "primals_165",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.attention.wv.weight",
-      "name": "primals_166",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.attention.wo.weight",
-      "name": "primals_167",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.feed_forward.w1.weight",
-      "name": "primals_168",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.feed_forward.w2.weight",
-      "name": "primals_169",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.18.feed_forward.w3.weight",
-      "name": "primals_170",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.attention.wq.weight",
-      "name": "primals_173",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.attention.wk.weight",
-      "name": "primals_174",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.attention.wv.weight",
-      "name": "primals_175",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.attention.wo.weight",
-      "name": "primals_176",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.feed_forward.w1.weight",
-      "name": "primals_177",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.feed_forward.w2.weight",
-      "name": "primals_178",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.19.feed_forward.w3.weight",
-      "name": "primals_179",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.attention.wq.weight",
-      "name": "primals_182",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.attention.wk.weight",
-      "name": "primals_183",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.attention.wv.weight",
-      "name": "primals_184",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.attention.wo.weight",
-      "name": "primals_185",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.feed_forward.w1.weight",
-      "name": "primals_186",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.feed_forward.w2.weight",
-      "name": "primals_187",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.20.feed_forward.w3.weight",
-      "name": "primals_188",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.attention.wq.weight",
-      "name": "primals_191",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.attention.wk.weight",
-      "name": "primals_192",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.attention.wv.weight",
-      "name": "primals_193",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.attention.wo.weight",
-      "name": "primals_194",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.feed_forward.w1.weight",
-      "name": "primals_195",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.feed_forward.w2.weight",
-      "name": "primals_196",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.21.feed_forward.w3.weight",
-      "name": "primals_197",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.22.attention.wq.weight",
-      "name": "primals_200",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.22.attention.wk.weight",
-      "name": "primals_201",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.22.attention.wv.weight",
-      "name": "primals_202",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        1024,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.22.attention.wo.weight",
-      "name": "primals_203",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        4096,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.22.feed_forward.w1.weight",
-      "name": "primals_204",
-      "op": "placeholder",
-      "placement": "S(0)S(0)",
-      "shape": [
-        14336,
-        4096
-      ]
-    },
-    {
-      "inputs": [],
-      "module_path": "layers.22.feed_forward.w2.weight",
-      "name": "primals_205",
-      "op": "placeholder",
-      "placement": "S(0)S(1)",
-      "shape": [
-        4096,
-        14336
-      ]
-    }
-  ]
-}
\ No newline at end of file
diff --git a/profile_results/real_llama3_3b_dag_node_stats.csv b/profile_results/real_llama3_3b_dag_node_stats.csv
deleted file mode 100644
index 5f813f1b..00000000
--- a/profile_results/real_llama3_3b_dag_node_stats.csv
+++ /dev/null
@@ -1,7200 +0,0 @@
-idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count
-0,primals_1,placeholder,primals_1,unknown,,0,0,1,0,5816,3
-1,primals_2,placeholder,primals_2,unknown,,0,0,1,0,5777,3
-2,primals_3,placeholder,primals_3,unknown,,0,0,1,0,5777,3
-3,primals_4,placeholder,primals_4,unknown,,0,0,1,0,5770,3
-4,primals_5,placeholder,primals_5,unknown,,0,0,1,0,5757,3
-5,primals_6,placeholder,primals_6,unknown,,0,0,1,0,5737,3
-6,primals_7,placeholder,primals_7,unknown,,0,0,1,0,5714,3
-7,primals_8,placeholder,primals_8,unknown,,0,0,1,0,5718,3
-8,primals_9,placeholder,primals_9,unknown,,0,0,1,0,5794,2
-9,primals_10,placeholder,primals_10,unknown,,0,0,1,0,5741,2
-10,primals_11,placeholder,primals_11,unknown,,0,0,1,0,5681,3
-11,primals_12,placeholder,primals_12,unknown,,0,0,1,0,5681,3
-12,primals_13,placeholder,primals_13,unknown,,0,0,1,0,5674,3
-13,primals_14,placeholder,primals_14,unknown,,0,0,1,0,5661,3
-14,primals_15,placeholder,primals_15,unknown,,0,0,1,0,5641,3
-15,primals_16,placeholder,primals_16,unknown,,0,0,1,0,5618,3
-16,primals_17,placeholder,primals_17,unknown,,0,0,1,0,5622,3
-17,primals_18,placeholder,primals_18,unknown,,0,0,1,0,5698,2
-18,primals_19,placeholder,primals_19,unknown,,0,0,1,0,5645,2
-19,primals_20,placeholder,primals_20,unknown,,0,0,1,0,5585,3
-20,primals_21,placeholder,primals_21,unknown,,0,0,1,0,5585,3
-21,primals_22,placeholder,primals_22,unknown,,0,0,1,0,5578,3
-22,primals_23,placeholder,primals_23,unknown,,0,0,1,0,5565,3
-23,primals_24,placeholder,primals_24,unknown,,0,0,1,0,5545,3
-24,primals_25,placeholder,primals_25,unknown,,0,0,1,0,5522,3
-25,primals_26,placeholder,primals_26,unknown,,0,0,1,0,5526,3
-26,primals_27,placeholder,primals_27,unknown,,0,0,1,0,5602,2
-27,primals_28,placeholder,primals_28,unknown,,0,0,1,0,5549,2
-28,primals_29,placeholder,primals_29,unknown,,0,0,1,0,5489,3
-29,primals_30,placeholder,primals_30,unknown,,0,0,1,0,5489,3
-30,primals_31,placeholder,primals_31,unknown,,0,0,1,0,5482,3
-31,primals_32,placeholder,primals_32,unknown,,0,0,1,0,5469,3
-32,primals_33,placeholder,primals_33,unknown,,0,0,1,0,5449,3
-33,primals_34,placeholder,primals_34,unknown,,0,0,1,0,5426,3
-34,primals_35,placeholder,primals_35,unknown,,0,0,1,0,5430,3
-35,primals_36,placeholder,primals_36,unknown,,0,0,1,0,5506,2
-36,primals_37,placeholder,primals_37,unknown,,0,0,1,0,5453,2
-37,primals_38,placeholder,primals_38,unknown,,0,0,1,0,5393,3
-38,primals_39,placeholder,primals_39,unknown,,0,0,1,0,5393,3
-39,primals_40,placeholder,primals_40,unknown,,0,0,1,0,5386,3
-40,primals_41,placeholder,primals_41,unknown,,0,0,1,0,5373,3
-41,primals_42,placeholder,primals_42,unknown,,0,0,1,0,5353,3
-42,primals_43,placeholder,primals_43,unknown,,0,0,1,0,5330,3
-43,primals_44,placeholder,primals_44,unknown,,0,0,1,0,5334,3
-44,primals_45,placeholder,primals_45,unknown,,0,0,1,0,5410,2
-45,primals_46,placeholder,primals_46,unknown,,0,0,1,0,5357,2
-46,primals_47,placeholder,primals_47,unknown,,0,0,1,0,5297,3
-47,primals_48,placeholder,primals_48,unknown,,0,0,1,0,5297,3
-48,primals_49,placeholder,primals_49,unknown,,0,0,1,0,5290,3
-49,primals_50,placeholder,primals_50,unknown,,0,0,1,0,5277,3
-50,primals_51,placeholder,primals_51,unknown,,0,0,1,0,5257,3
-51,primals_52,placeholder,primals_52,unknown,,0,0,1,0,5234,3
-52,primals_53,placeholder,primals_53,unknown,,0,0,1,0,5238,3
-53,primals_54,placeholder,primals_54,unknown,,0,0,1,0,5314,2
-54,primals_55,placeholder,primals_55,unknown,,0,0,1,0,5261,2
-55,primals_56,placeholder,primals_56,unknown,,0,0,1,0,5201,3
-56,primals_57,placeholder,primals_57,unknown,,0,0,1,0,5201,3
-57,primals_58,placeholder,primals_58,unknown,,0,0,1,0,5194,3
-58,primals_59,placeholder,primals_59,unknown,,0,0,1,0,5181,3
-59,primals_60,placeholder,primals_60,unknown,,0,0,1,0,5161,3
-60,primals_61,placeholder,primals_61,unknown,,0,0,1,0,5138,3
-61,primals_62,placeholder,primals_62,unknown,,0,0,1,0,5142,3
-62,primals_63,placeholder,primals_63,unknown,,0,0,1,0,5218,2
-63,primals_64,placeholder,primals_64,unknown,,0,0,1,0,5165,2
-64,primals_65,placeholder,primals_65,unknown,,0,0,1,0,5105,3
-65,primals_66,placeholder,primals_66,unknown,,0,0,1,0,5105,3
-66,primals_67,placeholder,primals_67,unknown,,0,0,1,0,5098,3
-67,primals_68,placeholder,primals_68,unknown,,0,0,1,0,5085,3
-68,primals_69,placeholder,primals_69,unknown,,0,0,1,0,5065,3
-69,primals_70,placeholder,primals_70,unknown,,0,0,1,0,5042,3
-70,primals_71,placeholder,primals_71,unknown,,0,0,1,0,5046,3
-71,primals_72,placeholder,primals_72,unknown,,0,0,1,0,5122,2
-72,primals_73,placeholder,primals_73,unknown,,0,0,1,0,5069,2
-73,primals_74,placeholder,primals_74,unknown,,0,0,1,0,5009,3
-74,primals_75,placeholder,primals_75,unknown,,0,0,1,0,5009,3
-75,primals_76,placeholder,primals_76,unknown,,0,0,1,0,5002,3
-76,primals_77,placeholder,primals_77,unknown,,0,0,1,0,4989,3
-77,primals_78,placeholder,primals_78,unknown,,0,0,1,0,4969,3
-78,primals_79,placeholder,primals_79,unknown,,0,0,1,0,4946,3
-79,primals_80,placeholder,primals_80,unknown,,0,0,1,0,4950,3
-80,primals_81,placeholder,primals_81,unknown,,0,0,1,0,5026,2
-81,primals_82,placeholder,primals_82,unknown,,0,0,1,0,4973,2
-82,primals_83,placeholder,primals_83,unknown,,0,0,1,0,4913,3
-83,primals_84,placeholder,primals_84,unknown,,0,0,1,0,4913,3
-84,primals_85,placeholder,primals_85,unknown,,0,0,1,0,4906,3
-85,primals_86,placeholder,primals_86,unknown,,0,0,1,0,4893,3
-86,primals_87,placeholder,primals_87,unknown,,0,0,1,0,4873,3
-87,primals_88,placeholder,primals_88,unknown,,0,0,1,0,4850,3
-88,primals_89,placeholder,primals_89,unknown,,0,0,1,0,4854,3
-89,primals_90,placeholder,primals_90,unknown,,0,0,1,0,4930,2
-90,primals_91,placeholder,primals_91,unknown,,0,0,1,0,4877,2
-91,primals_92,placeholder,primals_92,unknown,,0,0,1,0,4817,3
-92,primals_93,placeholder,primals_93,unknown,,0,0,1,0,4817,3
-93,primals_94,placeholder,primals_94,unknown,,0,0,1,0,4810,3
-94,primals_95,placeholder,primals_95,unknown,,0,0,1,0,4797,3
-95,primals_96,placeholder,primals_96,unknown,,0,0,1,0,4777,3
-96,primals_97,placeholder,primals_97,unknown,,0,0,1,0,4754,3
-97,primals_98,placeholder,primals_98,unknown,,0,0,1,0,4758,3
-98,primals_99,placeholder,primals_99,unknown,,0,0,1,0,4834,2
-99,primals_100,placeholder,primals_100,unknown,,0,0,1,0,4781,2
-100,primals_101,placeholder,primals_101,unknown,,0,0,1,0,4721,3
-101,primals_102,placeholder,primals_102,unknown,,0,0,1,0,4721,3
-102,primals_103,placeholder,primals_103,unknown,,0,0,1,0,4714,3
-103,primals_104,placeholder,primals_104,unknown,,0,0,1,0,4701,3
-104,primals_105,placeholder,primals_105,unknown,,0,0,1,0,4681,3
-105,primals_106,placeholder,primals_106,unknown,,0,0,1,0,4658,3
-106,primals_107,placeholder,primals_107,unknown,,0,0,1,0,4662,3
-107,primals_108,placeholder,primals_108,unknown,,0,0,1,0,4738,2
-108,primals_109,placeholder,primals_109,unknown,,0,0,1,0,4685,2
-109,primals_110,placeholder,primals_110,unknown,,0,0,1,0,4625,3
-110,primals_111,placeholder,primals_111,unknown,,0,0,1,0,4625,3
-111,primals_112,placeholder,primals_112,unknown,,0,0,1,0,4618,3
-112,primals_113,placeholder,primals_113,unknown,,0,0,1,0,4605,3
-113,primals_114,placeholder,primals_114,unknown,,0,0,1,0,4585,3
-114,primals_115,placeholder,primals_115,unknown,,0,0,1,0,4562,3
-115,primals_116,placeholder,primals_116,unknown,,0,0,1,0,4566,3
-116,primals_117,placeholder,primals_117,unknown,,0,0,1,0,4642,2
-117,primals_118,placeholder,primals_118,unknown,,0,0,1,0,4589,2
-118,primals_119,placeholder,primals_119,unknown,,0,0,1,0,4529,3
-119,primals_120,placeholder,primals_120,unknown,,0,0,1,0,4529,3
-120,primals_121,placeholder,primals_121,unknown,,0,0,1,0,4522,3
-121,primals_122,placeholder,primals_122,unknown,,0,0,1,0,4509,3
-122,primals_123,placeholder,primals_123,unknown,,0,0,1,0,4489,3
-123,primals_124,placeholder,primals_124,unknown,,0,0,1,0,4466,3
-124,primals_125,placeholder,primals_125,unknown,,0,0,1,0,4470,3
-125,primals_126,placeholder,primals_126,unknown,,0,0,1,0,4546,2
-126,primals_127,placeholder,primals_127,unknown,,0,0,1,0,4493,2
-127,primals_128,placeholder,primals_128,unknown,,0,0,1,0,4433,3
-128,primals_129,placeholder,primals_129,unknown,,0,0,1,0,4433,3
-129,primals_130,placeholder,primals_130,unknown,,0,0,1,0,4426,3
-130,primals_131,placeholder,primals_131,unknown,,0,0,1,0,4413,3
-131,primals_132,placeholder,primals_132,unknown,,0,0,1,0,4393,3
-132,primals_133,placeholder,primals_133,unknown,,0,0,1,0,4370,3
-133,primals_134,placeholder,primals_134,unknown,,0,0,1,0,4374,3
-134,primals_135,placeholder,primals_135,unknown,,0,0,1,0,4450,2
-135,primals_136,placeholder,primals_136,unknown,,0,0,1,0,4397,2
-136,primals_137,placeholder,primals_137,unknown,,0,0,1,0,4337,3
-137,primals_138,placeholder,primals_138,unknown,,0,0,1,0,4337,3
-138,primals_139,placeholder,primals_139,unknown,,0,0,1,0,4330,3
-139,primals_140,placeholder,primals_140,unknown,,0,0,1,0,4317,3
-140,primals_141,placeholder,primals_141,unknown,,0,0,1,0,4297,3
-141,primals_142,placeholder,primals_142,unknown,,0,0,1,0,4274,3
-142,primals_143,placeholder,primals_143,unknown,,0,0,1,0,4278,3
-143,primals_144,placeholder,primals_144,unknown,,0,0,1,0,4354,2
-144,primals_145,placeholder,primals_145,unknown,,0,0,1,0,4301,2
-145,primals_146,placeholder,primals_146,unknown,,0,0,1,0,4241,3
-146,primals_147,placeholder,primals_147,unknown,,0,0,1,0,4241,3
-147,primals_148,placeholder,primals_148,unknown,,0,0,1,0,4234,3
-148,primals_149,placeholder,primals_149,unknown,,0,0,1,0,4221,3
-149,primals_150,placeholder,primals_150,unknown,,0,0,1,0,4201,3
-150,primals_151,placeholder,primals_151,unknown,,0,0,1,0,4178,3
-151,primals_152,placeholder,primals_152,unknown,,0,0,1,0,4182,3
-152,primals_153,placeholder,primals_153,unknown,,0,0,1,0,4258,2
-153,primals_154,placeholder,primals_154,unknown,,0,0,1,0,4205,2
-154,primals_155,placeholder,primals_155,unknown,,0,0,1,0,4145,3
-155,primals_156,placeholder,primals_156,unknown,,0,0,1,0,4145,3
-156,primals_157,placeholder,primals_157,unknown,,0,0,1,0,4138,3
-157,primals_158,placeholder,primals_158,unknown,,0,0,1,0,4125,3
-158,primals_159,placeholder,primals_159,unknown,,0,0,1,0,4105,3
-159,primals_160,placeholder,primals_160,unknown,,0,0,1,0,4082,3
-160,primals_161,placeholder,primals_161,unknown,,0,0,1,0,4086,3
-161,primals_162,placeholder,primals_162,unknown,,0,0,1,0,4162,2
-162,primals_163,placeholder,primals_163,unknown,,0,0,1,0,4109,2
-163,primals_164,placeholder,primals_164,unknown,,0,0,1,0,4049,3
-164,primals_165,placeholder,primals_165,unknown,,0,0,1,0,4049,3
-165,primals_166,placeholder,primals_166,unknown,,0,0,1,0,4042,3
-166,primals_167,placeholder,primals_167,unknown,,0,0,1,0,4029,3
-167,primals_168,placeholder,primals_168,unknown,,0,0,1,0,4009,3
-168,primals_169,placeholder,primals_169,unknown,,0,0,1,0,3986,3
-169,primals_170,placeholder,primals_170,unknown,,0,0,1,0,3990,3
-170,primals_171,placeholder,primals_171,unknown,,0,0,1,0,4066,2
-171,primals_172,placeholder,primals_172,unknown,,0,0,1,0,4013,2
-172,primals_173,placeholder,primals_173,unknown,,0,0,1,0,3953,3
-173,primals_174,placeholder,primals_174,unknown,,0,0,1,0,3953,3
-174,primals_175,placeholder,primals_175,unknown,,0,0,1,0,3946,3
-175,primals_176,placeholder,primals_176,unknown,,0,0,1,0,3933,3
-176,primals_177,placeholder,primals_177,unknown,,0,0,1,0,3913,3
-177,primals_178,placeholder,primals_178,unknown,,0,0,1,0,3890,3
-178,primals_179,placeholder,primals_179,unknown,,0,0,1,0,3894,3
-179,primals_180,placeholder,primals_180,unknown,,0,0,1,0,3970,2
-180,primals_181,placeholder,primals_181,unknown,,0,0,1,0,3917,2
-181,primals_182,placeholder,primals_182,unknown,,0,0,1,0,3857,3
-182,primals_183,placeholder,primals_183,unknown,,0,0,1,0,3857,3
-183,primals_184,placeholder,primals_184,unknown,,0,0,1,0,3850,3
-184,primals_185,placeholder,primals_185,unknown,,0,0,1,0,3837,3
-185,primals_186,placeholder,primals_186,unknown,,0,0,1,0,3817,3
-186,primals_187,placeholder,primals_187,unknown,,0,0,1,0,3794,3
-187,primals_188,placeholder,primals_188,unknown,,0,0,1,0,3798,3
-188,primals_189,placeholder,primals_189,unknown,,0,0,1,0,3874,2
-189,primals_190,placeholder,primals_190,unknown,,0,0,1,0,3821,2
-190,primals_191,placeholder,primals_191,unknown,,0,0,1,0,3761,3
-191,primals_192,placeholder,primals_192,unknown,,0,0,1,0,3761,3
-192,primals_193,placeholder,primals_193,unknown,,0,0,1,0,3754,3
-193,primals_194,placeholder,primals_194,unknown,,0,0,1,0,3741,3
-194,primals_195,placeholder,primals_195,unknown,,0,0,1,0,3721,3
-195,primals_196,placeholder,primals_196,unknown,,0,0,1,0,3698,3
-196,primals_197,placeholder,primals_197,unknown,,0,0,1,0,3702,3
-197,primals_198,placeholder,primals_198,unknown,,0,0,1,0,3778,2
-198,primals_199,placeholder,primals_199,unknown,,0,0,1,0,3725,2
-199,primals_200,placeholder,primals_200,unknown,,0,0,1,0,3665,3
-200,primals_201,placeholder,primals_201,unknown,,0,0,1,0,3665,3
-201,primals_202,placeholder,primals_202,unknown,,0,0,1,0,3658,3
-202,primals_203,placeholder,primals_203,unknown,,0,0,1,0,3645,3
-203,primals_204,placeholder,primals_204,unknown,,0,0,1,0,3625,3
-204,primals_205,placeholder,primals_205,unknown,,0,0,1,0,3602,3
-205,primals_206,placeholder,primals_206,unknown,,0,0,1,0,3606,3
-206,primals_207,placeholder,primals_207,unknown,,0,0,1,0,3682,2
-207,primals_208,placeholder,primals_208,unknown,,0,0,1,0,3629,2
-208,primals_209,placeholder,primals_209,unknown,,0,0,1,0,3569,3
-209,primals_210,placeholder,primals_210,unknown,,0,0,1,0,3569,3
-210,primals_211,placeholder,primals_211,unknown,,0,0,1,0,3562,3
-211,primals_212,placeholder,primals_212,unknown,,0,0,1,0,3549,3
-212,primals_213,placeholder,primals_213,unknown,,0,0,1,0,3529,3
-213,primals_214,placeholder,primals_214,unknown,,0,0,1,0,3506,3
-214,primals_215,placeholder,primals_215,unknown,,0,0,1,0,3510,3
-215,primals_216,placeholder,primals_216,unknown,,0,0,1,0,3586,2
-216,primals_217,placeholder,primals_217,unknown,,0,0,1,0,3533,2
-217,primals_218,placeholder,primals_218,unknown,,0,0,1,0,3473,3
-218,primals_219,placeholder,primals_219,unknown,,0,0,1,0,3473,3
-219,primals_220,placeholder,primals_220,unknown,,0,0,1,0,3466,3
-220,primals_221,placeholder,primals_221,unknown,,0,0,1,0,3453,3
-221,primals_222,placeholder,primals_222,unknown,,0,0,1,0,3433,3
-222,primals_223,placeholder,primals_223,unknown,,0,0,1,0,3410,3
-223,primals_224,placeholder,primals_224,unknown,,0,0,1,0,3414,3
-224,primals_225,placeholder,primals_225,unknown,,0,0,1,0,3490,2
-225,primals_226,placeholder,primals_226,unknown,,0,0,1,0,3437,2
-226,primals_227,placeholder,primals_227,unknown,,0,0,1,0,3377,3
-227,primals_228,placeholder,primals_228,unknown,,0,0,1,0,3377,3
-228,primals_229,placeholder,primals_229,unknown,,0,0,1,0,3370,3
-229,primals_230,placeholder,primals_230,unknown,,0,0,1,0,3357,3
-230,primals_231,placeholder,primals_231,unknown,,0,0,1,0,3337,3
-231,primals_232,placeholder,primals_232,unknown,,0,0,1,0,3314,3
-232,primals_233,placeholder,primals_233,unknown,,0,0,1,0,3318,3
-233,primals_234,placeholder,primals_234,unknown,,0,0,1,0,3394,2
-234,primals_235,placeholder,primals_235,unknown,,0,0,1,0,3341,2
-235,primals_236,placeholder,primals_236,unknown,,0,0,1,0,3281,3
-236,primals_237,placeholder,primals_237,unknown,,0,0,1,0,3281,3
-237,primals_238,placeholder,primals_238,unknown,,0,0,1,0,3274,3
-238,primals_239,placeholder,primals_239,unknown,,0,0,1,0,3261,3
-239,primals_240,placeholder,primals_240,unknown,,0,0,1,0,3241,3
-240,primals_241,placeholder,primals_241,unknown,,0,0,1,0,3218,3
-241,primals_242,placeholder,primals_242,unknown,,0,0,1,0,3222,3
-242,primals_243,placeholder,primals_243,unknown,,0,0,1,0,3298,2
-243,primals_244,placeholder,primals_244,unknown,,0,0,1,0,3245,2
-244,primals_245,placeholder,primals_245,unknown,,0,0,1,0,3185,3
-245,primals_246,placeholder,primals_246,unknown,,0,0,1,0,3185,3
-246,primals_247,placeholder,primals_247,unknown,,0,0,1,0,3178,3
-247,primals_248,placeholder,primals_248,unknown,,0,0,1,0,3165,3
-248,primals_249,placeholder,primals_249,unknown,,0,0,1,0,3145,3
-249,primals_250,placeholder,primals_250,unknown,,0,0,1,0,3122,3
-250,primals_251,placeholder,primals_251,unknown,,0,0,1,0,3126,3
-251,primals_252,placeholder,primals_252,unknown,,0,0,1,0,3202,2
-252,primals_253,placeholder,primals_253,unknown,,0,0,1,0,3149,2
-253,primals_254,placeholder,primals_254,unknown,,0,0,1,0,3103,2
-254,primals_255,placeholder,primals_255,unknown,,0,0,1,0,5943,3
-255,primals_256,placeholder,primals_256,unknown,,0,0,1,0,5806,3
-256,tangents_1,placeholder,tangents_1,backward,,0,0,1,0,3104,4
-257,alias_default,call_function,alias.default,unknown,,1,1,2,1,5815,3
-258,dtype_cast,call_function,dtype_cast.default,forward,,1,1,1,2,5805,3
-259,alias_default_2,call_function,alias.default,unknown,,1,1,2,1,5805,3
-260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5
-261,dtype_cast_1,call_function,dtype_cast.default,forward,0,1,1,1,1,5793,2
-262,alias_default_4,call_function,alias.default,forward,,1,1,3,6,5803,4
-263,convert_element_type,call_function,convert_element_type.default,forward,0,1,1,1,7,5801,4
-264,alias_default_6,call_function,alias.default,forward,0,1,1,2,8,5800,4
-265,pow_1,call_function,pow.Tensor_Scalar,forward,0,1,1,1,9,5799,4
-266,mean,call_function,mean.dim,forward,0,1,1,1,10,5798,4
-267,add,call_function,add.Scalar,forward,0,1,1,1,11,5797,3
-268,rsqrt,call_function,rsqrt.default,forward,0,1,1,1,12,5796,3
-269,alias_default_7,call_function,alias.default,forward,0,1,1,3,13,5795,3
-270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8
-271,alias_default_5,call_function,alias.default,forward,0,1,1,2,2,5792,2
-272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8
-273,convert_element_type_1,call_function,convert_element_type.default,forward,0,1,1,1,19,5789,6
-274,dtype_cast_2,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3
-275,permute,call_function,permute.default,forward,0,1,1,1,2,5775,3
-276,alias_default_8,call_function,alias.default,forward,0,1,1,6,20,5788,4
-277,alias_default_9,call_function,alias.default,forward,0,1,1,2,3,5774,3
-278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5
-279,dtype_cast_3,call_function,dtype_cast.default,forward,0,1,1,1,1,5776,3
-280,permute_1,call_function,permute.default,forward,0,1,1,1,2,5775,3
-281,alias_default_10,call_function,alias.default,forward,0,1,1,2,3,5774,3
-282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5
-283,dtype_cast_4,call_function,dtype_cast.default,forward,0,1,1,1,1,5769,3
-284,permute_2,call_function,permute.default,forward,0,1,1,1,2,5768,3
-285,alias_default_11,call_function,alias.default,forward,0,1,1,2,3,5767,3
-286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5
-287,view_6,call_function,view.default,forward,0,1,1,1,26,5771,4
-288,view_7,call_function,view.default,forward,0,1,1,1,26,5771,4
-289,view_8,call_function,view.default,forward,0,1,1,1,26,5764,4
-290,convert_element_type_8,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4
-291,view_9,call_function,view.default,forward,0,1,1,1,28,5769,4
-292,view_as_complex,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6
-293,convert_element_type_9,call_function,convert_element_type.default,forward,0,1,1,1,27,5770,4
-294,view_10,call_function,view.default,forward,0,1,1,1,28,5769,4
-295,view_as_complex_1,call_function,view_as_complex.default,forward,0,1,1,1,29,5768,6
-296,alias_default_1,call_function,alias.default,unknown,,1,1,28,1,5942,3
-297,view_11,call_function,view.default,forward,0,1,1,1,2,5779,3
-298,alias_default_12,call_function,alias.default,forward,0,1,1,4,3,5778,3
-299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
-300,view_as_real,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6
-301,view_12,call_function,view.default,forward,0,1,1,1,36,5765,6
-302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
-303,view_as_real_1,call_function,view_as_real.default,forward,0,1,1,1,35,5766,6
-304,view_13,call_function,view.default,forward,0,1,1,1,36,5765,6
-305,convert_element_type_10,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6
-306,convert_element_type_11,call_function,convert_element_type.default,forward,0,1,1,1,37,5764,6
-307,permute_3,call_function,permute.default,forward,0,1,1,1,38,5763,6
-308,permute_4,call_function,permute.default,forward,0,1,1,1,38,5763,6
-309,permute_5,call_function,permute.default,forward,0,1,1,1,27,5763,4
-310,alias_default_13,call_function,alias.default,forward,0,1,1,2,39,5762,4
-311,alias_default_14,call_function,alias.default,forward,0,1,1,2,39,5762,4
-312,alias_default_15,call_function,alias.default,forward,0,1,1,2,28,5762,4
-313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2
-314,getitem,call_function,getitem,forward,0,1,1,1,64,5757,2
-315,getitem_1,call_function,getitem,forward,0,1,1,1,64,64,2
-316,getitem_6,call_function,getitem,forward,0,1,1,1,64,64,1
-317,getitem_7,call_function,getitem,forward,0,1,1,1,64,64,1
-318,alias_default_16,call_function,alias.default,forward,0,1,1,2,65,5756,4
-319,permute_6,call_function,permute.default,forward,0,1,1,1,66,5755,4
-320,view_14,call_function,view.default,forward,0,1,1,1,67,5754,3
-321,dtype_cast_5,call_function,dtype_cast.default,forward,0,1,1,1,1,5756,3
-322,permute_7,call_function,permute.default,forward,0,1,1,1,2,5755,3
-323,alias_default_17,call_function,alias.default,forward,0,1,1,2,68,5753,4
-324,alias_default_18,call_function,alias.default,forward,0,1,1,2,3,5754,3
-325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5
-326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10
-327,dtype_cast_6,call_function,dtype_cast.default,forward,0,1,1,1,1,5740,2
-328,alias_default_19,call_function,alias.default,forward,0,1,1,3,75,5750,4
-329,convert_element_type_14,call_function,convert_element_type.default,forward,0,1,1,1,76,5748,4
-330,alias_default_21,call_function,alias.default,forward,0,1,1,2,77,5747,4
-331,pow_2,call_function,pow.Tensor_Scalar,forward,0,1,1,1,78,5746,4
-332,mean_1,call_function,mean.dim,forward,0,1,1,1,79,5745,4
-333,add_2,call_function,add.Scalar,forward,0,1,1,1,80,5744,3
-334,rsqrt_1,call_function,rsqrt.default,forward,0,1,1,1,81,5743,3
-335,alias_default_22,call_function,alias.default,forward,0,1,1,3,82,5742,3
-336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8
-337,alias_default_20,call_function,alias.default,forward,0,1,1,2,2,5739,2
-338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8
-339,convert_element_type_15,call_function,convert_element_type.default,forward,0,1,1,1,88,5736,6
-340,dtype_cast_7,call_function,dtype_cast.default,forward,0,1,1,1,1,5736,3
-341,permute_8,call_function,permute.default,forward,0,1,1,1,2,5735,3
-342,alias_default_23,call_function,alias.default,forward,0,1,1,4,89,5735,4
-343,alias_default_24,call_function,alias.default,forward,0,1,1,2,3,5734,3
-344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5
-345,alias_default_25,call_function,alias.default,forward,0,1,1,2,95,5731,4
-346,convert_element_type_18,call_function,convert_element_type.default,forward,0,1,1,1,96,5719,4
-347,alias_default_26,call_function,alias.default,forward,0,1,1,2,97,5718,4
-348,neg,call_function,neg.default,forward,0,1,1,1,98,5717,8
-349,exp,call_function,exp.default,forward,0,1,1,1,99,5716,6
-350,add_3,call_function,add.Tensor,forward,0,1,1,1,100,5715,4
-351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6
-352,convert_element_type_19,call_function,convert_element_type.default,forward,0,1,1,1,102,5713,6
-353,dtype_cast_8,call_function,dtype_cast.default,forward,0,1,1,1,1,5717,3
-354,permute_9,call_function,permute.default,forward,0,1,1,1,2,5716,3
-355,alias_default_28,call_function,alias.default,forward,0,1,1,2,3,5715,3
-356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5
-357,alias_default_27,call_function,alias.default,forward,0,1,1,2,103,5712,4
-358,alias_default_29,call_function,alias.default,forward,0,1,1,2,95,5712,4
-359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8
-360,dtype_cast_9,call_function,dtype_cast.default,forward,0,1,1,1,1,5713,3
-361,permute_10,call_function,permute.default,forward,0,1,1,1,2,5712,3
-362,alias_default_30,call_function,alias.default,forward,0,1,1,2,111,5710,4
-363,alias_default_31,call_function,alias.default,forward,0,1,1,2,3,5711,3
-364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5
-365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10
-366,dtype_cast_10,call_function,dtype_cast.default,forward,1,1,1,1,1,5697,2
-367,alias_default_32,call_function,alias.default,forward,0,1,1,3,118,5707,4
-368,convert_element_type_24,call_function,convert_element_type.default,forward,1,1,1,1,119,5705,4
-369,alias_default_34,call_function,alias.default,forward,1,1,1,2,120,5704,4
-370,pow_3,call_function,pow.Tensor_Scalar,forward,1,1,1,1,121,5703,4
-371,mean_2,call_function,mean.dim,forward,1,1,1,1,122,5702,4
-372,add_5,call_function,add.Scalar,forward,1,1,1,1,123,5701,3
-373,rsqrt_2,call_function,rsqrt.default,forward,1,1,1,1,124,5700,3
-374,alias_default_35,call_function,alias.default,forward,1,1,1,3,125,5699,3
-375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8
-376,alias_default_33,call_function,alias.default,forward,1,1,1,2,2,5696,2
-377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8
-378,convert_element_type_25,call_function,convert_element_type.default,forward,1,1,1,1,131,5693,6
-379,dtype_cast_11,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3
-380,permute_11,call_function,permute.default,forward,1,1,1,1,2,5679,3
-381,alias_default_36,call_function,alias.default,forward,1,1,1,6,132,5692,4
-382,alias_default_37,call_function,alias.default,forward,1,1,1,2,3,5678,3
-383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5
-384,dtype_cast_12,call_function,dtype_cast.default,forward,1,1,1,1,1,5680,3
-385,permute_12,call_function,permute.default,forward,1,1,1,1,2,5679,3
-386,alias_default_38,call_function,alias.default,forward,1,1,1,2,3,5678,3
-387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5
-388,dtype_cast_13,call_function,dtype_cast.default,forward,1,1,1,1,1,5673,3
-389,permute_13,call_function,permute.default,forward,1,1,1,1,2,5672,3
-390,alias_default_39,call_function,alias.default,forward,1,1,1,2,3,5671,3
-391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5
-392,view_29,call_function,view.default,forward,1,1,1,1,138,5675,4
-393,view_30,call_function,view.default,forward,1,1,1,1,138,5675,4
-394,view_31,call_function,view.default,forward,1,1,1,1,138,5668,4
-395,convert_element_type_32,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4
-396,view_32,call_function,view.default,forward,1,1,1,1,140,5673,4
-397,view_as_complex_2,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6
-398,convert_element_type_33,call_function,convert_element_type.default,forward,1,1,1,1,139,5674,4
-399,view_33,call_function,view.default,forward,1,1,1,1,140,5673,4
-400,view_as_complex_3,call_function,view_as_complex.default,forward,1,1,1,1,141,5672,6
-401,view_34,call_function,view.default,forward,1,1,1,1,2,5683,3
-402,alias_default_40,call_function,alias.default,forward,1,1,1,4,3,5682,3
-403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
-404,view_as_real_2,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6
-405,view_35,call_function,view.default,forward,1,1,1,1,146,5669,6
-406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
-407,view_as_real_3,call_function,view_as_real.default,forward,1,1,1,1,145,5670,6
-408,view_36,call_function,view.default,forward,1,1,1,1,146,5669,6
-409,convert_element_type_34,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6
-410,convert_element_type_35,call_function,convert_element_type.default,forward,1,1,1,1,147,5668,6
-411,permute_14,call_function,permute.default,forward,1,1,1,1,148,5667,6
-412,permute_15,call_function,permute.default,forward,1,1,1,1,148,5667,6
-413,permute_16,call_function,permute.default,forward,1,1,1,1,139,5667,4
-414,alias_default_41,call_function,alias.default,forward,1,1,1,2,149,5666,4
-415,alias_default_42,call_function,alias.default,forward,1,1,1,2,149,5666,4
-416,alias_default_43,call_function,alias.default,forward,1,1,1,2,140,5666,4
-417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2
-418,getitem_9,call_function,getitem,forward,1,1,1,1,174,5661,2
-419,getitem_10,call_function,getitem,forward,1,1,1,1,174,174,2
-420,getitem_15,call_function,getitem,forward,1,1,1,1,174,174,1
-421,getitem_16,call_function,getitem,forward,1,1,1,1,174,174,1
-422,alias_default_44,call_function,alias.default,forward,1,1,1,2,175,5660,4
-423,permute_17,call_function,permute.default,forward,1,1,1,1,176,5659,4
-424,view_37,call_function,view.default,forward,1,1,1,1,177,5658,3
-425,dtype_cast_14,call_function,dtype_cast.default,forward,1,1,1,1,1,5660,3
-426,permute_18,call_function,permute.default,forward,1,1,1,1,2,5659,3
-427,alias_default_45,call_function,alias.default,forward,1,1,1,2,178,5657,4
-428,alias_default_46,call_function,alias.default,forward,1,1,1,2,3,5658,3
-429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5
-430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10
-431,dtype_cast_15,call_function,dtype_cast.default,forward,1,1,1,1,1,5644,2
-432,alias_default_47,call_function,alias.default,forward,1,1,1,3,185,5654,4
-433,convert_element_type_38,call_function,convert_element_type.default,forward,1,1,1,1,186,5652,4
-434,alias_default_49,call_function,alias.default,forward,1,1,1,2,187,5651,4
-435,pow_4,call_function,pow.Tensor_Scalar,forward,1,1,1,1,188,5650,4
-436,mean_3,call_function,mean.dim,forward,1,1,1,1,189,5649,4
-437,add_7,call_function,add.Scalar,forward,1,1,1,1,190,5648,3
-438,rsqrt_3,call_function,rsqrt.default,forward,1,1,1,1,191,5647,3
-439,alias_default_50,call_function,alias.default,forward,1,1,1,3,192,5646,3
-440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8
-441,alias_default_48,call_function,alias.default,forward,1,1,1,2,2,5643,2
-442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8
-443,convert_element_type_39,call_function,convert_element_type.default,forward,1,1,1,1,198,5640,6
-444,dtype_cast_16,call_function,dtype_cast.default,forward,1,1,1,1,1,5640,3
-445,permute_19,call_function,permute.default,forward,1,1,1,1,2,5639,3
-446,alias_default_51,call_function,alias.default,forward,1,1,1,4,199,5639,4
-447,alias_default_52,call_function,alias.default,forward,1,1,1,2,3,5638,3
-448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5
-449,alias_default_53,call_function,alias.default,forward,1,1,1,2,205,5635,4
-450,convert_element_type_42,call_function,convert_element_type.default,forward,1,1,1,1,206,5623,4
-451,alias_default_54,call_function,alias.default,forward,1,1,1,2,207,5622,4
-452,neg_1,call_function,neg.default,forward,1,1,1,1,208,5621,8
-453,exp_1,call_function,exp.default,forward,1,1,1,1,209,5620,6
-454,add_8,call_function,add.Tensor,forward,1,1,1,1,210,5619,4
-455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6
-456,convert_element_type_43,call_function,convert_element_type.default,forward,1,1,1,1,212,5617,6
-457,dtype_cast_17,call_function,dtype_cast.default,forward,1,1,1,1,1,5621,3
-458,permute_20,call_function,permute.default,forward,1,1,1,1,2,5620,3
-459,alias_default_56,call_function,alias.default,forward,1,1,1,2,3,5619,3
-460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5
-461,alias_default_55,call_function,alias.default,forward,1,1,1,2,213,5616,4
-462,alias_default_57,call_function,alias.default,forward,1,1,1,2,205,5616,4
-463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8
-464,dtype_cast_18,call_function,dtype_cast.default,forward,1,1,1,1,1,5617,3
-465,permute_21,call_function,permute.default,forward,1,1,1,1,2,5616,3
-466,alias_default_58,call_function,alias.default,forward,1,1,1,2,221,5614,4
-467,alias_default_59,call_function,alias.default,forward,1,1,1,2,3,5615,3
-468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5
-469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10
-470,dtype_cast_19,call_function,dtype_cast.default,forward,2,1,1,1,1,5601,2
-471,alias_default_60,call_function,alias.default,forward,1,1,1,3,228,5611,4
-472,convert_element_type_48,call_function,convert_element_type.default,forward,2,1,1,1,229,5609,4
-473,alias_default_62,call_function,alias.default,forward,2,1,1,2,230,5608,4
-474,pow_5,call_function,pow.Tensor_Scalar,forward,2,1,1,1,231,5607,4
-475,mean_4,call_function,mean.dim,forward,2,1,1,1,232,5606,4
-476,add_10,call_function,add.Scalar,forward,2,1,1,1,233,5605,3
-477,rsqrt_4,call_function,rsqrt.default,forward,2,1,1,1,234,5604,3
-478,alias_default_63,call_function,alias.default,forward,2,1,1,3,235,5603,3
-479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8
-480,alias_default_61,call_function,alias.default,forward,2,1,1,2,2,5600,2
-481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8
-482,convert_element_type_49,call_function,convert_element_type.default,forward,2,1,1,1,241,5597,6
-483,dtype_cast_20,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3
-484,permute_22,call_function,permute.default,forward,2,1,1,1,2,5583,3
-485,alias_default_64,call_function,alias.default,forward,2,1,1,6,242,5596,4
-486,alias_default_65,call_function,alias.default,forward,2,1,1,2,3,5582,3
-487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5
-488,dtype_cast_21,call_function,dtype_cast.default,forward,2,1,1,1,1,5584,3
-489,permute_23,call_function,permute.default,forward,2,1,1,1,2,5583,3
-490,alias_default_66,call_function,alias.default,forward,2,1,1,2,3,5582,3
-491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5
-492,dtype_cast_22,call_function,dtype_cast.default,forward,2,1,1,1,1,5577,3
-493,permute_24,call_function,permute.default,forward,2,1,1,1,2,5576,3
-494,alias_default_67,call_function,alias.default,forward,2,1,1,2,3,5575,3
-495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5
-496,view_52,call_function,view.default,forward,2,1,1,1,248,5579,4
-497,view_53,call_function,view.default,forward,2,1,1,1,248,5579,4
-498,view_54,call_function,view.default,forward,2,1,1,1,248,5572,4
-499,convert_element_type_56,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4
-500,view_55,call_function,view.default,forward,2,1,1,1,250,5577,4
-501,view_as_complex_4,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6
-502,convert_element_type_57,call_function,convert_element_type.default,forward,2,1,1,1,249,5578,4
-503,view_56,call_function,view.default,forward,2,1,1,1,250,5577,4
-504,view_as_complex_5,call_function,view_as_complex.default,forward,2,1,1,1,251,5576,6
-505,view_57,call_function,view.default,forward,2,1,1,1,2,5587,3
-506,alias_default_68,call_function,alias.default,forward,2,1,1,4,3,5586,3
-507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
-508,view_as_real_4,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6
-509,view_58,call_function,view.default,forward,2,1,1,1,256,5573,6
-510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
-511,view_as_real_5,call_function,view_as_real.default,forward,2,1,1,1,255,5574,6
-512,view_59,call_function,view.default,forward,2,1,1,1,256,5573,6
-513,convert_element_type_58,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6
-514,convert_element_type_59,call_function,convert_element_type.default,forward,2,1,1,1,257,5572,6
-515,permute_25,call_function,permute.default,forward,2,1,1,1,258,5571,6
-516,permute_26,call_function,permute.default,forward,2,1,1,1,258,5571,6
-517,permute_27,call_function,permute.default,forward,2,1,1,1,249,5571,4
-518,alias_default_69,call_function,alias.default,forward,2,1,1,2,259,5570,4
-519,alias_default_70,call_function,alias.default,forward,2,1,1,2,259,5570,4
-520,alias_default_71,call_function,alias.default,forward,2,1,1,2,250,5570,4
-521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2
-522,getitem_18,call_function,getitem,forward,2,1,1,1,284,5565,2
-523,getitem_19,call_function,getitem,forward,2,1,1,1,284,284,2
-524,getitem_24,call_function,getitem,forward,2,1,1,1,284,284,1
-525,getitem_25,call_function,getitem,forward,2,1,1,1,284,284,1
-526,alias_default_72,call_function,alias.default,forward,2,1,1,2,285,5564,4
-527,permute_28,call_function,permute.default,forward,2,1,1,1,286,5563,4
-528,view_60,call_function,view.default,forward,2,1,1,1,287,5562,3
-529,dtype_cast_23,call_function,dtype_cast.default,forward,2,1,1,1,1,5564,3
-530,permute_29,call_function,permute.default,forward,2,1,1,1,2,5563,3
-531,alias_default_73,call_function,alias.default,forward,2,1,1,2,288,5561,4
-532,alias_default_74,call_function,alias.default,forward,2,1,1,2,3,5562,3
-533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5
-534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10
-535,dtype_cast_24,call_function,dtype_cast.default,forward,2,1,1,1,1,5548,2
-536,alias_default_75,call_function,alias.default,forward,2,1,1,3,295,5558,4
-537,convert_element_type_62,call_function,convert_element_type.default,forward,2,1,1,1,296,5556,4
-538,alias_default_77,call_function,alias.default,forward,2,1,1,2,297,5555,4
-539,pow_6,call_function,pow.Tensor_Scalar,forward,2,1,1,1,298,5554,4
-540,mean_5,call_function,mean.dim,forward,2,1,1,1,299,5553,4
-541,add_12,call_function,add.Scalar,forward,2,1,1,1,300,5552,3
-542,rsqrt_5,call_function,rsqrt.default,forward,2,1,1,1,301,5551,3
-543,alias_default_78,call_function,alias.default,forward,2,1,1,3,302,5550,3
-544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8
-545,alias_default_76,call_function,alias.default,forward,2,1,1,2,2,5547,2
-546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8
-547,convert_element_type_63,call_function,convert_element_type.default,forward,2,1,1,1,308,5544,6
-548,dtype_cast_25,call_function,dtype_cast.default,forward,2,1,1,1,1,5544,3
-549,permute_30,call_function,permute.default,forward,2,1,1,1,2,5543,3
-550,alias_default_79,call_function,alias.default,forward,2,1,1,4,309,5543,4
-551,alias_default_80,call_function,alias.default,forward,2,1,1,2,3,5542,3
-552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5
-553,alias_default_81,call_function,alias.default,forward,2,1,1,2,315,5539,4
-554,convert_element_type_66,call_function,convert_element_type.default,forward,2,1,1,1,316,5527,4
-555,alias_default_82,call_function,alias.default,forward,2,1,1,2,317,5526,4
-556,neg_2,call_function,neg.default,forward,2,1,1,1,318,5525,8
-557,exp_2,call_function,exp.default,forward,2,1,1,1,319,5524,6
-558,add_13,call_function,add.Tensor,forward,2,1,1,1,320,5523,4
-559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6
-560,convert_element_type_67,call_function,convert_element_type.default,forward,2,1,1,1,322,5521,6
-561,dtype_cast_26,call_function,dtype_cast.default,forward,2,1,1,1,1,5525,3
-562,permute_31,call_function,permute.default,forward,2,1,1,1,2,5524,3
-563,alias_default_84,call_function,alias.default,forward,2,1,1,2,3,5523,3
-564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5
-565,alias_default_83,call_function,alias.default,forward,2,1,1,2,323,5520,4
-566,alias_default_85,call_function,alias.default,forward,2,1,1,2,315,5520,4
-567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8
-568,dtype_cast_27,call_function,dtype_cast.default,forward,2,1,1,1,1,5521,3
-569,permute_32,call_function,permute.default,forward,2,1,1,1,2,5520,3
-570,alias_default_86,call_function,alias.default,forward,2,1,1,2,331,5518,4
-571,alias_default_87,call_function,alias.default,forward,2,1,1,2,3,5519,3
-572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5
-573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10
-574,dtype_cast_28,call_function,dtype_cast.default,forward,3,1,1,1,1,5505,2
-575,alias_default_88,call_function,alias.default,forward,2,1,1,3,338,5515,4
-576,convert_element_type_72,call_function,convert_element_type.default,forward,3,1,1,1,339,5513,4
-577,alias_default_90,call_function,alias.default,forward,3,1,1,2,340,5512,4
-578,pow_7,call_function,pow.Tensor_Scalar,forward,3,1,1,1,341,5511,4
-579,mean_6,call_function,mean.dim,forward,3,1,1,1,342,5510,4
-580,add_15,call_function,add.Scalar,forward,3,1,1,1,343,5509,3
-581,rsqrt_6,call_function,rsqrt.default,forward,3,1,1,1,344,5508,3
-582,alias_default_91,call_function,alias.default,forward,3,1,1,3,345,5507,3
-583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8
-584,alias_default_89,call_function,alias.default,forward,3,1,1,2,2,5504,2
-585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8
-586,convert_element_type_73,call_function,convert_element_type.default,forward,3,1,1,1,351,5501,6
-587,dtype_cast_29,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3
-588,permute_33,call_function,permute.default,forward,3,1,1,1,2,5487,3
-589,alias_default_92,call_function,alias.default,forward,3,1,1,6,352,5500,4
-590,alias_default_93,call_function,alias.default,forward,3,1,1,2,3,5486,3
-591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5
-592,dtype_cast_30,call_function,dtype_cast.default,forward,3,1,1,1,1,5488,3
-593,permute_34,call_function,permute.default,forward,3,1,1,1,2,5487,3
-594,alias_default_94,call_function,alias.default,forward,3,1,1,2,3,5486,3
-595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5
-596,dtype_cast_31,call_function,dtype_cast.default,forward,3,1,1,1,1,5481,3
-597,permute_35,call_function,permute.default,forward,3,1,1,1,2,5480,3
-598,alias_default_95,call_function,alias.default,forward,3,1,1,2,3,5479,3
-599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5
-600,view_75,call_function,view.default,forward,3,1,1,1,358,5483,4
-601,view_76,call_function,view.default,forward,3,1,1,1,358,5483,4
-602,view_77,call_function,view.default,forward,3,1,1,1,358,5476,4
-603,convert_element_type_80,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4
-604,view_78,call_function,view.default,forward,3,1,1,1,360,5481,4
-605,view_as_complex_6,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6
-606,convert_element_type_81,call_function,convert_element_type.default,forward,3,1,1,1,359,5482,4
-607,view_79,call_function,view.default,forward,3,1,1,1,360,5481,4
-608,view_as_complex_7,call_function,view_as_complex.default,forward,3,1,1,1,361,5480,6
-609,view_80,call_function,view.default,forward,3,1,1,1,2,5491,3
-610,alias_default_96,call_function,alias.default,forward,3,1,1,4,3,5490,3
-611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
-612,view_as_real_6,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6
-613,view_81,call_function,view.default,forward,3,1,1,1,366,5477,6
-614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
-615,view_as_real_7,call_function,view_as_real.default,forward,3,1,1,1,365,5478,6
-616,view_82,call_function,view.default,forward,3,1,1,1,366,5477,6
-617,convert_element_type_82,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6
-618,convert_element_type_83,call_function,convert_element_type.default,forward,3,1,1,1,367,5476,6
-619,permute_36,call_function,permute.default,forward,3,1,1,1,368,5475,6
-620,permute_37,call_function,permute.default,forward,3,1,1,1,368,5475,6
-621,permute_38,call_function,permute.default,forward,3,1,1,1,359,5475,4
-622,alias_default_97,call_function,alias.default,forward,3,1,1,2,369,5474,4
-623,alias_default_98,call_function,alias.default,forward,3,1,1,2,369,5474,4
-624,alias_default_99,call_function,alias.default,forward,3,1,1,2,360,5474,4
-625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2
-626,getitem_27,call_function,getitem,forward,3,1,1,1,394,5469,2
-627,getitem_28,call_function,getitem,forward,3,1,1,1,394,394,2
-628,getitem_33,call_function,getitem,forward,3,1,1,1,394,394,1
-629,getitem_34,call_function,getitem,forward,3,1,1,1,394,394,1
-630,alias_default_100,call_function,alias.default,forward,3,1,1,2,395,5468,4
-631,permute_39,call_function,permute.default,forward,3,1,1,1,396,5467,4
-632,view_83,call_function,view.default,forward,3,1,1,1,397,5466,3
-633,dtype_cast_32,call_function,dtype_cast.default,forward,3,1,1,1,1,5468,3
-634,permute_40,call_function,permute.default,forward,3,1,1,1,2,5467,3
-635,alias_default_101,call_function,alias.default,forward,3,1,1,2,398,5465,4
-636,alias_default_102,call_function,alias.default,forward,3,1,1,2,3,5466,3
-637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5
-638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10
-639,dtype_cast_33,call_function,dtype_cast.default,forward,3,1,1,1,1,5452,2
-640,alias_default_103,call_function,alias.default,forward,3,1,1,3,405,5462,4
-641,convert_element_type_86,call_function,convert_element_type.default,forward,3,1,1,1,406,5460,4
-642,alias_default_105,call_function,alias.default,forward,3,1,1,2,407,5459,4
-643,pow_8,call_function,pow.Tensor_Scalar,forward,3,1,1,1,408,5458,4
-644,mean_7,call_function,mean.dim,forward,3,1,1,1,409,5457,4
-645,add_17,call_function,add.Scalar,forward,3,1,1,1,410,5456,3
-646,rsqrt_7,call_function,rsqrt.default,forward,3,1,1,1,411,5455,3
-647,alias_default_106,call_function,alias.default,forward,3,1,1,3,412,5454,3
-648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8
-649,alias_default_104,call_function,alias.default,forward,3,1,1,2,2,5451,2
-650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8
-651,convert_element_type_87,call_function,convert_element_type.default,forward,3,1,1,1,418,5448,6
-652,dtype_cast_34,call_function,dtype_cast.default,forward,3,1,1,1,1,5448,3
-653,permute_41,call_function,permute.default,forward,3,1,1,1,2,5447,3
-654,alias_default_107,call_function,alias.default,forward,3,1,1,4,419,5447,4
-655,alias_default_108,call_function,alias.default,forward,3,1,1,2,3,5446,3
-656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5
-657,alias_default_109,call_function,alias.default,forward,3,1,1,2,425,5443,4
-658,convert_element_type_90,call_function,convert_element_type.default,forward,3,1,1,1,426,5431,4
-659,alias_default_110,call_function,alias.default,forward,3,1,1,2,427,5430,4
-660,neg_3,call_function,neg.default,forward,3,1,1,1,428,5429,8
-661,exp_3,call_function,exp.default,forward,3,1,1,1,429,5428,6
-662,add_18,call_function,add.Tensor,forward,3,1,1,1,430,5427,4
-663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6
-664,convert_element_type_91,call_function,convert_element_type.default,forward,3,1,1,1,432,5425,6
-665,dtype_cast_35,call_function,dtype_cast.default,forward,3,1,1,1,1,5429,3
-666,permute_42,call_function,permute.default,forward,3,1,1,1,2,5428,3
-667,alias_default_112,call_function,alias.default,forward,3,1,1,2,3,5427,3
-668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5
-669,alias_default_111,call_function,alias.default,forward,3,1,1,2,433,5424,4
-670,alias_default_113,call_function,alias.default,forward,3,1,1,2,425,5424,4
-671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8
-672,dtype_cast_36,call_function,dtype_cast.default,forward,3,1,1,1,1,5425,3
-673,permute_43,call_function,permute.default,forward,3,1,1,1,2,5424,3
-674,alias_default_114,call_function,alias.default,forward,3,1,1,2,441,5422,4
-675,alias_default_115,call_function,alias.default,forward,3,1,1,2,3,5423,3
-676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5
-677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10
-678,dtype_cast_37,call_function,dtype_cast.default,forward,4,1,1,1,1,5409,2
-679,alias_default_116,call_function,alias.default,forward,3,1,1,3,448,5419,4
-680,convert_element_type_96,call_function,convert_element_type.default,forward,4,1,1,1,449,5417,4
-681,alias_default_118,call_function,alias.default,forward,4,1,1,2,450,5416,4
-682,pow_9,call_function,pow.Tensor_Scalar,forward,4,1,1,1,451,5415,4
-683,mean_8,call_function,mean.dim,forward,4,1,1,1,452,5414,4
-684,add_20,call_function,add.Scalar,forward,4,1,1,1,453,5413,3
-685,rsqrt_8,call_function,rsqrt.default,forward,4,1,1,1,454,5412,3
-686,alias_default_119,call_function,alias.default,forward,4,1,1,3,455,5411,3
-687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8
-688,alias_default_117,call_function,alias.default,forward,4,1,1,2,2,5408,2
-689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8
-690,convert_element_type_97,call_function,convert_element_type.default,forward,4,1,1,1,461,5405,6
-691,dtype_cast_38,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3
-692,permute_44,call_function,permute.default,forward,4,1,1,1,2,5391,3
-693,alias_default_120,call_function,alias.default,forward,4,1,1,6,462,5404,4
-694,alias_default_121,call_function,alias.default,forward,4,1,1,2,3,5390,3
-695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5
-696,dtype_cast_39,call_function,dtype_cast.default,forward,4,1,1,1,1,5392,3
-697,permute_45,call_function,permute.default,forward,4,1,1,1,2,5391,3
-698,alias_default_122,call_function,alias.default,forward,4,1,1,2,3,5390,3
-699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5
-700,dtype_cast_40,call_function,dtype_cast.default,forward,4,1,1,1,1,5385,3
-701,permute_46,call_function,permute.default,forward,4,1,1,1,2,5384,3
-702,alias_default_123,call_function,alias.default,forward,4,1,1,2,3,5383,3
-703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5
-704,view_98,call_function,view.default,forward,4,1,1,1,468,5387,4
-705,view_99,call_function,view.default,forward,4,1,1,1,468,5387,4
-706,view_100,call_function,view.default,forward,4,1,1,1,468,5380,4
-707,convert_element_type_104,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4
-708,view_101,call_function,view.default,forward,4,1,1,1,470,5385,4
-709,view_as_complex_8,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6
-710,convert_element_type_105,call_function,convert_element_type.default,forward,4,1,1,1,469,5386,4
-711,view_102,call_function,view.default,forward,4,1,1,1,470,5385,4
-712,view_as_complex_9,call_function,view_as_complex.default,forward,4,1,1,1,471,5384,6
-713,view_103,call_function,view.default,forward,4,1,1,1,2,5395,3
-714,alias_default_124,call_function,alias.default,forward,4,1,1,4,3,5394,3
-715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
-716,view_as_real_8,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6
-717,view_104,call_function,view.default,forward,4,1,1,1,476,5381,6
-718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
-719,view_as_real_9,call_function,view_as_real.default,forward,4,1,1,1,475,5382,6
-720,view_105,call_function,view.default,forward,4,1,1,1,476,5381,6
-721,convert_element_type_106,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6
-722,convert_element_type_107,call_function,convert_element_type.default,forward,4,1,1,1,477,5380,6
-723,permute_47,call_function,permute.default,forward,4,1,1,1,478,5379,6
-724,permute_48,call_function,permute.default,forward,4,1,1,1,478,5379,6
-725,permute_49,call_function,permute.default,forward,4,1,1,1,469,5379,4
-726,alias_default_125,call_function,alias.default,forward,4,1,1,2,479,5378,4
-727,alias_default_126,call_function,alias.default,forward,4,1,1,2,479,5378,4
-728,alias_default_127,call_function,alias.default,forward,4,1,1,2,470,5378,4
-729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2
-730,getitem_36,call_function,getitem,forward,4,1,1,1,504,5373,2
-731,getitem_37,call_function,getitem,forward,4,1,1,1,504,504,2
-732,getitem_42,call_function,getitem,forward,4,1,1,1,504,504,1
-733,getitem_43,call_function,getitem,forward,4,1,1,1,504,504,1
-734,alias_default_128,call_function,alias.default,forward,4,1,1,2,505,5372,4
-735,permute_50,call_function,permute.default,forward,4,1,1,1,506,5371,4
-736,view_106,call_function,view.default,forward,4,1,1,1,507,5370,3
-737,dtype_cast_41,call_function,dtype_cast.default,forward,4,1,1,1,1,5372,3
-738,permute_51,call_function,permute.default,forward,4,1,1,1,2,5371,3
-739,alias_default_129,call_function,alias.default,forward,4,1,1,2,508,5369,4
-740,alias_default_130,call_function,alias.default,forward,4,1,1,2,3,5370,3
-741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5
-742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10
-743,dtype_cast_42,call_function,dtype_cast.default,forward,4,1,1,1,1,5356,2
-744,alias_default_131,call_function,alias.default,forward,4,1,1,3,515,5366,4
-745,convert_element_type_110,call_function,convert_element_type.default,forward,4,1,1,1,516,5364,4
-746,alias_default_133,call_function,alias.default,forward,4,1,1,2,517,5363,4
-747,pow_10,call_function,pow.Tensor_Scalar,forward,4,1,1,1,518,5362,4
-748,mean_9,call_function,mean.dim,forward,4,1,1,1,519,5361,4
-749,add_22,call_function,add.Scalar,forward,4,1,1,1,520,5360,3
-750,rsqrt_9,call_function,rsqrt.default,forward,4,1,1,1,521,5359,3
-751,alias_default_134,call_function,alias.default,forward,4,1,1,3,522,5358,3
-752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8
-753,alias_default_132,call_function,alias.default,forward,4,1,1,2,2,5355,2
-754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8
-755,convert_element_type_111,call_function,convert_element_type.default,forward,4,1,1,1,528,5352,6
-756,dtype_cast_43,call_function,dtype_cast.default,forward,4,1,1,1,1,5352,3
-757,permute_52,call_function,permute.default,forward,4,1,1,1,2,5351,3
-758,alias_default_135,call_function,alias.default,forward,4,1,1,4,529,5351,4
-759,alias_default_136,call_function,alias.default,forward,4,1,1,2,3,5350,3
-760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5
-761,alias_default_137,call_function,alias.default,forward,4,1,1,2,535,5347,4
-762,convert_element_type_114,call_function,convert_element_type.default,forward,4,1,1,1,536,5335,4
-763,alias_default_138,call_function,alias.default,forward,4,1,1,2,537,5334,4
-764,neg_4,call_function,neg.default,forward,4,1,1,1,538,5333,8
-765,exp_4,call_function,exp.default,forward,4,1,1,1,539,5332,6
-766,add_23,call_function,add.Tensor,forward,4,1,1,1,540,5331,4
-767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6
-768,convert_element_type_115,call_function,convert_element_type.default,forward,4,1,1,1,542,5329,6
-769,dtype_cast_44,call_function,dtype_cast.default,forward,4,1,1,1,1,5333,3
-770,permute_53,call_function,permute.default,forward,4,1,1,1,2,5332,3
-771,alias_default_140,call_function,alias.default,forward,4,1,1,2,3,5331,3
-772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5
-773,alias_default_139,call_function,alias.default,forward,4,1,1,2,543,5328,4
-774,alias_default_141,call_function,alias.default,forward,4,1,1,2,535,5328,4
-775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8
-776,dtype_cast_45,call_function,dtype_cast.default,forward,4,1,1,1,1,5329,3
-777,permute_54,call_function,permute.default,forward,4,1,1,1,2,5328,3
-778,alias_default_142,call_function,alias.default,forward,4,1,1,2,551,5326,4
-779,alias_default_143,call_function,alias.default,forward,4,1,1,2,3,5327,3
-780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5
-781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10
-782,dtype_cast_46,call_function,dtype_cast.default,forward,5,1,1,1,1,5313,2
-783,alias_default_144,call_function,alias.default,forward,4,1,1,3,558,5323,4
-784,convert_element_type_120,call_function,convert_element_type.default,forward,5,1,1,1,559,5321,4
-785,alias_default_146,call_function,alias.default,forward,5,1,1,2,560,5320,4
-786,pow_11,call_function,pow.Tensor_Scalar,forward,5,1,1,1,561,5319,4
-787,mean_10,call_function,mean.dim,forward,5,1,1,1,562,5318,4
-788,add_25,call_function,add.Scalar,forward,5,1,1,1,563,5317,3
-789,rsqrt_10,call_function,rsqrt.default,forward,5,1,1,1,564,5316,3
-790,alias_default_147,call_function,alias.default,forward,5,1,1,3,565,5315,3
-791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8
-792,alias_default_145,call_function,alias.default,forward,5,1,1,2,2,5312,2
-793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8
-794,convert_element_type_121,call_function,convert_element_type.default,forward,5,1,1,1,571,5309,6
-795,dtype_cast_47,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3
-796,permute_55,call_function,permute.default,forward,5,1,1,1,2,5295,3
-797,alias_default_148,call_function,alias.default,forward,5,1,1,6,572,5308,4
-798,alias_default_149,call_function,alias.default,forward,5,1,1,2,3,5294,3
-799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5
-800,dtype_cast_48,call_function,dtype_cast.default,forward,5,1,1,1,1,5296,3
-801,permute_56,call_function,permute.default,forward,5,1,1,1,2,5295,3
-802,alias_default_150,call_function,alias.default,forward,5,1,1,2,3,5294,3
-803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5
-804,dtype_cast_49,call_function,dtype_cast.default,forward,5,1,1,1,1,5289,3
-805,permute_57,call_function,permute.default,forward,5,1,1,1,2,5288,3
-806,alias_default_151,call_function,alias.default,forward,5,1,1,2,3,5287,3
-807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5
-808,view_121,call_function,view.default,forward,5,1,1,1,578,5291,4
-809,view_122,call_function,view.default,forward,5,1,1,1,578,5291,4
-810,view_123,call_function,view.default,forward,5,1,1,1,578,5284,4
-811,convert_element_type_128,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4
-812,view_124,call_function,view.default,forward,5,1,1,1,580,5289,4
-813,view_as_complex_10,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6
-814,convert_element_type_129,call_function,convert_element_type.default,forward,5,1,1,1,579,5290,4
-815,view_125,call_function,view.default,forward,5,1,1,1,580,5289,4
-816,view_as_complex_11,call_function,view_as_complex.default,forward,5,1,1,1,581,5288,6
-817,view_126,call_function,view.default,forward,5,1,1,1,2,5299,3
-818,alias_default_152,call_function,alias.default,forward,5,1,1,4,3,5298,3
-819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
-820,view_as_real_10,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6
-821,view_127,call_function,view.default,forward,5,1,1,1,586,5285,6
-822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
-823,view_as_real_11,call_function,view_as_real.default,forward,5,1,1,1,585,5286,6
-824,view_128,call_function,view.default,forward,5,1,1,1,586,5285,6
-825,convert_element_type_130,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6
-826,convert_element_type_131,call_function,convert_element_type.default,forward,5,1,1,1,587,5284,6
-827,permute_58,call_function,permute.default,forward,5,1,1,1,588,5283,6
-828,permute_59,call_function,permute.default,forward,5,1,1,1,588,5283,6
-829,permute_60,call_function,permute.default,forward,5,1,1,1,579,5283,4
-830,alias_default_153,call_function,alias.default,forward,5,1,1,2,589,5282,4
-831,alias_default_154,call_function,alias.default,forward,5,1,1,2,589,5282,4
-832,alias_default_155,call_function,alias.default,forward,5,1,1,2,580,5282,4
-833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2
-834,getitem_45,call_function,getitem,forward,5,1,1,1,614,5277,2
-835,getitem_46,call_function,getitem,forward,5,1,1,1,614,614,2
-836,getitem_51,call_function,getitem,forward,5,1,1,1,614,614,1
-837,getitem_52,call_function,getitem,forward,5,1,1,1,614,614,1
-838,alias_default_156,call_function,alias.default,forward,5,1,1,2,615,5276,4
-839,permute_61,call_function,permute.default,forward,5,1,1,1,616,5275,4
-840,view_129,call_function,view.default,forward,5,1,1,1,617,5274,3
-841,dtype_cast_50,call_function,dtype_cast.default,forward,5,1,1,1,1,5276,3
-842,permute_62,call_function,permute.default,forward,5,1,1,1,2,5275,3
-843,alias_default_157,call_function,alias.default,forward,5,1,1,2,618,5273,4
-844,alias_default_158,call_function,alias.default,forward,5,1,1,2,3,5274,3
-845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5
-846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10
-847,dtype_cast_51,call_function,dtype_cast.default,forward,5,1,1,1,1,5260,2
-848,alias_default_159,call_function,alias.default,forward,5,1,1,3,625,5270,4
-849,convert_element_type_134,call_function,convert_element_type.default,forward,5,1,1,1,626,5268,4
-850,alias_default_161,call_function,alias.default,forward,5,1,1,2,627,5267,4
-851,pow_12,call_function,pow.Tensor_Scalar,forward,5,1,1,1,628,5266,4
-852,mean_11,call_function,mean.dim,forward,5,1,1,1,629,5265,4
-853,add_27,call_function,add.Scalar,forward,5,1,1,1,630,5264,3
-854,rsqrt_11,call_function,rsqrt.default,forward,5,1,1,1,631,5263,3
-855,alias_default_162,call_function,alias.default,forward,5,1,1,3,632,5262,3
-856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8
-857,alias_default_160,call_function,alias.default,forward,5,1,1,2,2,5259,2
-858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8
-859,convert_element_type_135,call_function,convert_element_type.default,forward,5,1,1,1,638,5256,6
-860,dtype_cast_52,call_function,dtype_cast.default,forward,5,1,1,1,1,5256,3
-861,permute_63,call_function,permute.default,forward,5,1,1,1,2,5255,3
-862,alias_default_163,call_function,alias.default,forward,5,1,1,4,639,5255,4
-863,alias_default_164,call_function,alias.default,forward,5,1,1,2,3,5254,3
-864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5
-865,alias_default_165,call_function,alias.default,forward,5,1,1,2,645,5251,4
-866,convert_element_type_138,call_function,convert_element_type.default,forward,5,1,1,1,646,5239,4
-867,alias_default_166,call_function,alias.default,forward,5,1,1,2,647,5238,4
-868,neg_5,call_function,neg.default,forward,5,1,1,1,648,5237,8
-869,exp_5,call_function,exp.default,forward,5,1,1,1,649,5236,6
-870,add_28,call_function,add.Tensor,forward,5,1,1,1,650,5235,4
-871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6
-872,convert_element_type_139,call_function,convert_element_type.default,forward,5,1,1,1,652,5233,6
-873,dtype_cast_53,call_function,dtype_cast.default,forward,5,1,1,1,1,5237,3
-874,permute_64,call_function,permute.default,forward,5,1,1,1,2,5236,3
-875,alias_default_168,call_function,alias.default,forward,5,1,1,2,3,5235,3
-876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5
-877,alias_default_167,call_function,alias.default,forward,5,1,1,2,653,5232,4
-878,alias_default_169,call_function,alias.default,forward,5,1,1,2,645,5232,4
-879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8
-880,dtype_cast_54,call_function,dtype_cast.default,forward,5,1,1,1,1,5233,3
-881,permute_65,call_function,permute.default,forward,5,1,1,1,2,5232,3
-882,alias_default_170,call_function,alias.default,forward,5,1,1,2,661,5230,4
-883,alias_default_171,call_function,alias.default,forward,5,1,1,2,3,5231,3
-884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5
-885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10
-886,dtype_cast_55,call_function,dtype_cast.default,forward,6,1,1,1,1,5217,2
-887,alias_default_172,call_function,alias.default,forward,5,1,1,3,668,5227,4
-888,convert_element_type_144,call_function,convert_element_type.default,forward,6,1,1,1,669,5225,4
-889,alias_default_174,call_function,alias.default,forward,6,1,1,2,670,5224,4
-890,pow_13,call_function,pow.Tensor_Scalar,forward,6,1,1,1,671,5223,4
-891,mean_12,call_function,mean.dim,forward,6,1,1,1,672,5222,4
-892,add_30,call_function,add.Scalar,forward,6,1,1,1,673,5221,3
-893,rsqrt_12,call_function,rsqrt.default,forward,6,1,1,1,674,5220,3
-894,alias_default_175,call_function,alias.default,forward,6,1,1,3,675,5219,3
-895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8
-896,alias_default_173,call_function,alias.default,forward,6,1,1,2,2,5216,2
-897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8
-898,convert_element_type_145,call_function,convert_element_type.default,forward,6,1,1,1,681,5213,6
-899,dtype_cast_56,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3
-900,permute_66,call_function,permute.default,forward,6,1,1,1,2,5199,3
-901,alias_default_176,call_function,alias.default,forward,6,1,1,6,682,5212,4
-902,alias_default_177,call_function,alias.default,forward,6,1,1,2,3,5198,3
-903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5
-904,dtype_cast_57,call_function,dtype_cast.default,forward,6,1,1,1,1,5200,3
-905,permute_67,call_function,permute.default,forward,6,1,1,1,2,5199,3
-906,alias_default_178,call_function,alias.default,forward,6,1,1,2,3,5198,3
-907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5
-908,dtype_cast_58,call_function,dtype_cast.default,forward,6,1,1,1,1,5193,3
-909,permute_68,call_function,permute.default,forward,6,1,1,1,2,5192,3
-910,alias_default_179,call_function,alias.default,forward,6,1,1,2,3,5191,3
-911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5
-912,view_144,call_function,view.default,forward,6,1,1,1,688,5195,4
-913,view_145,call_function,view.default,forward,6,1,1,1,688,5195,4
-914,view_146,call_function,view.default,forward,6,1,1,1,688,5188,4
-915,convert_element_type_152,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4
-916,view_147,call_function,view.default,forward,6,1,1,1,690,5193,4
-917,view_as_complex_12,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6
-918,convert_element_type_153,call_function,convert_element_type.default,forward,6,1,1,1,689,5194,4
-919,view_148,call_function,view.default,forward,6,1,1,1,690,5193,4
-920,view_as_complex_13,call_function,view_as_complex.default,forward,6,1,1,1,691,5192,6
-921,view_149,call_function,view.default,forward,6,1,1,1,2,5203,3
-922,alias_default_180,call_function,alias.default,forward,6,1,1,4,3,5202,3
-923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
-924,view_as_real_12,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6
-925,view_150,call_function,view.default,forward,6,1,1,1,696,5189,6
-926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
-927,view_as_real_13,call_function,view_as_real.default,forward,6,1,1,1,695,5190,6
-928,view_151,call_function,view.default,forward,6,1,1,1,696,5189,6
-929,convert_element_type_154,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6
-930,convert_element_type_155,call_function,convert_element_type.default,forward,6,1,1,1,697,5188,6
-931,permute_69,call_function,permute.default,forward,6,1,1,1,698,5187,6
-932,permute_70,call_function,permute.default,forward,6,1,1,1,698,5187,6
-933,permute_71,call_function,permute.default,forward,6,1,1,1,689,5187,4
-934,alias_default_181,call_function,alias.default,forward,6,1,1,2,699,5186,4
-935,alias_default_182,call_function,alias.default,forward,6,1,1,2,699,5186,4
-936,alias_default_183,call_function,alias.default,forward,6,1,1,2,690,5186,4
-937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2
-938,getitem_54,call_function,getitem,forward,6,1,1,1,724,5181,2
-939,getitem_55,call_function,getitem,forward,6,1,1,1,724,724,2
-940,getitem_60,call_function,getitem,forward,6,1,1,1,724,724,1
-941,getitem_61,call_function,getitem,forward,6,1,1,1,724,724,1
-942,alias_default_184,call_function,alias.default,forward,6,1,1,2,725,5180,4
-943,permute_72,call_function,permute.default,forward,6,1,1,1,726,5179,4
-944,view_152,call_function,view.default,forward,6,1,1,1,727,5178,3
-945,dtype_cast_59,call_function,dtype_cast.default,forward,6,1,1,1,1,5180,3
-946,permute_73,call_function,permute.default,forward,6,1,1,1,2,5179,3
-947,alias_default_185,call_function,alias.default,forward,6,1,1,2,728,5177,4
-948,alias_default_186,call_function,alias.default,forward,6,1,1,2,3,5178,3
-949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5
-950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10
-951,dtype_cast_60,call_function,dtype_cast.default,forward,6,1,1,1,1,5164,2
-952,alias_default_187,call_function,alias.default,forward,6,1,1,3,735,5174,4
-953,convert_element_type_158,call_function,convert_element_type.default,forward,6,1,1,1,736,5172,4
-954,alias_default_189,call_function,alias.default,forward,6,1,1,2,737,5171,4
-955,pow_14,call_function,pow.Tensor_Scalar,forward,6,1,1,1,738,5170,4
-956,mean_13,call_function,mean.dim,forward,6,1,1,1,739,5169,4
-957,add_32,call_function,add.Scalar,forward,6,1,1,1,740,5168,3
-958,rsqrt_13,call_function,rsqrt.default,forward,6,1,1,1,741,5167,3
-959,alias_default_190,call_function,alias.default,forward,6,1,1,3,742,5166,3
-960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8
-961,alias_default_188,call_function,alias.default,forward,6,1,1,2,2,5163,2
-962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8
-963,convert_element_type_159,call_function,convert_element_type.default,forward,6,1,1,1,748,5160,6
-964,dtype_cast_61,call_function,dtype_cast.default,forward,6,1,1,1,1,5160,3
-965,permute_74,call_function,permute.default,forward,6,1,1,1,2,5159,3
-966,alias_default_191,call_function,alias.default,forward,6,1,1,4,749,5159,4
-967,alias_default_192,call_function,alias.default,forward,6,1,1,2,3,5158,3
-968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5
-969,alias_default_193,call_function,alias.default,forward,6,1,1,2,755,5155,4
-970,convert_element_type_162,call_function,convert_element_type.default,forward,6,1,1,1,756,5143,4
-971,alias_default_194,call_function,alias.default,forward,6,1,1,2,757,5142,4
-972,neg_6,call_function,neg.default,forward,6,1,1,1,758,5141,8
-973,exp_6,call_function,exp.default,forward,6,1,1,1,759,5140,6
-974,add_33,call_function,add.Tensor,forward,6,1,1,1,760,5139,4
-975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6
-976,convert_element_type_163,call_function,convert_element_type.default,forward,6,1,1,1,762,5137,6
-977,dtype_cast_62,call_function,dtype_cast.default,forward,6,1,1,1,1,5141,3
-978,permute_75,call_function,permute.default,forward,6,1,1,1,2,5140,3
-979,alias_default_196,call_function,alias.default,forward,6,1,1,2,3,5139,3
-980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5
-981,alias_default_195,call_function,alias.default,forward,6,1,1,2,763,5136,4
-982,alias_default_197,call_function,alias.default,forward,6,1,1,2,755,5136,4
-983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8
-984,dtype_cast_63,call_function,dtype_cast.default,forward,6,1,1,1,1,5137,3
-985,permute_76,call_function,permute.default,forward,6,1,1,1,2,5136,3
-986,alias_default_198,call_function,alias.default,forward,6,1,1,2,771,5134,4
-987,alias_default_199,call_function,alias.default,forward,6,1,1,2,3,5135,3
-988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5
-989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10
-990,dtype_cast_64,call_function,dtype_cast.default,forward,7,1,1,1,1,5121,2
-991,alias_default_200,call_function,alias.default,forward,6,1,1,3,778,5131,4
-992,convert_element_type_168,call_function,convert_element_type.default,forward,7,1,1,1,779,5129,4
-993,alias_default_202,call_function,alias.default,forward,7,1,1,2,780,5128,4
-994,pow_15,call_function,pow.Tensor_Scalar,forward,7,1,1,1,781,5127,4
-995,mean_14,call_function,mean.dim,forward,7,1,1,1,782,5126,4
-996,add_35,call_function,add.Scalar,forward,7,1,1,1,783,5125,3
-997,rsqrt_14,call_function,rsqrt.default,forward,7,1,1,1,784,5124,3
-998,alias_default_203,call_function,alias.default,forward,7,1,1,3,785,5123,3
-999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8
-1000,alias_default_201,call_function,alias.default,forward,7,1,1,2,2,5120,2
-1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8
-1002,convert_element_type_169,call_function,convert_element_type.default,forward,7,1,1,1,791,5117,6
-1003,dtype_cast_65,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3
-1004,permute_77,call_function,permute.default,forward,7,1,1,1,2,5103,3
-1005,alias_default_204,call_function,alias.default,forward,7,1,1,6,792,5116,4
-1006,alias_default_205,call_function,alias.default,forward,7,1,1,2,3,5102,3
-1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5
-1008,dtype_cast_66,call_function,dtype_cast.default,forward,7,1,1,1,1,5104,3
-1009,permute_78,call_function,permute.default,forward,7,1,1,1,2,5103,3
-1010,alias_default_206,call_function,alias.default,forward,7,1,1,2,3,5102,3
-1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5
-1012,dtype_cast_67,call_function,dtype_cast.default,forward,7,1,1,1,1,5097,3
-1013,permute_79,call_function,permute.default,forward,7,1,1,1,2,5096,3
-1014,alias_default_207,call_function,alias.default,forward,7,1,1,2,3,5095,3
-1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5
-1016,view_167,call_function,view.default,forward,7,1,1,1,798,5099,4
-1017,view_168,call_function,view.default,forward,7,1,1,1,798,5099,4
-1018,view_169,call_function,view.default,forward,7,1,1,1,798,5092,4
-1019,convert_element_type_176,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4
-1020,view_170,call_function,view.default,forward,7,1,1,1,800,5097,4
-1021,view_as_complex_14,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6
-1022,convert_element_type_177,call_function,convert_element_type.default,forward,7,1,1,1,799,5098,4
-1023,view_171,call_function,view.default,forward,7,1,1,1,800,5097,4
-1024,view_as_complex_15,call_function,view_as_complex.default,forward,7,1,1,1,801,5096,6
-1025,view_172,call_function,view.default,forward,7,1,1,1,2,5107,3
-1026,alias_default_208,call_function,alias.default,forward,7,1,1,4,3,5106,3
-1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
-1028,view_as_real_14,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6
-1029,view_173,call_function,view.default,forward,7,1,1,1,806,5093,6
-1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
-1031,view_as_real_15,call_function,view_as_real.default,forward,7,1,1,1,805,5094,6
-1032,view_174,call_function,view.default,forward,7,1,1,1,806,5093,6
-1033,convert_element_type_178,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6
-1034,convert_element_type_179,call_function,convert_element_type.default,forward,7,1,1,1,807,5092,6
-1035,permute_80,call_function,permute.default,forward,7,1,1,1,808,5091,6
-1036,permute_81,call_function,permute.default,forward,7,1,1,1,808,5091,6
-1037,permute_82,call_function,permute.default,forward,7,1,1,1,799,5091,4
-1038,alias_default_209,call_function,alias.default,forward,7,1,1,2,809,5090,4
-1039,alias_default_210,call_function,alias.default,forward,7,1,1,2,809,5090,4
-1040,alias_default_211,call_function,alias.default,forward,7,1,1,2,800,5090,4
-1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2
-1042,getitem_63,call_function,getitem,forward,7,1,1,1,834,5085,2
-1043,getitem_64,call_function,getitem,forward,7,1,1,1,834,834,2
-1044,getitem_69,call_function,getitem,forward,7,1,1,1,834,834,1
-1045,getitem_70,call_function,getitem,forward,7,1,1,1,834,834,1
-1046,alias_default_212,call_function,alias.default,forward,7,1,1,2,835,5084,4
-1047,permute_83,call_function,permute.default,forward,7,1,1,1,836,5083,4
-1048,view_175,call_function,view.default,forward,7,1,1,1,837,5082,3
-1049,dtype_cast_68,call_function,dtype_cast.default,forward,7,1,1,1,1,5084,3
-1050,permute_84,call_function,permute.default,forward,7,1,1,1,2,5083,3
-1051,alias_default_213,call_function,alias.default,forward,7,1,1,2,838,5081,4
-1052,alias_default_214,call_function,alias.default,forward,7,1,1,2,3,5082,3
-1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5
-1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10
-1055,dtype_cast_69,call_function,dtype_cast.default,forward,7,1,1,1,1,5068,2
-1056,alias_default_215,call_function,alias.default,forward,7,1,1,3,845,5078,4
-1057,convert_element_type_182,call_function,convert_element_type.default,forward,7,1,1,1,846,5076,4
-1058,alias_default_217,call_function,alias.default,forward,7,1,1,2,847,5075,4
-1059,pow_16,call_function,pow.Tensor_Scalar,forward,7,1,1,1,848,5074,4
-1060,mean_15,call_function,mean.dim,forward,7,1,1,1,849,5073,4
-1061,add_37,call_function,add.Scalar,forward,7,1,1,1,850,5072,3
-1062,rsqrt_15,call_function,rsqrt.default,forward,7,1,1,1,851,5071,3
-1063,alias_default_218,call_function,alias.default,forward,7,1,1,3,852,5070,3
-1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8
-1065,alias_default_216,call_function,alias.default,forward,7,1,1,2,2,5067,2
-1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8
-1067,convert_element_type_183,call_function,convert_element_type.default,forward,7,1,1,1,858,5064,6
-1068,dtype_cast_70,call_function,dtype_cast.default,forward,7,1,1,1,1,5064,3
-1069,permute_85,call_function,permute.default,forward,7,1,1,1,2,5063,3
-1070,alias_default_219,call_function,alias.default,forward,7,1,1,4,859,5063,4
-1071,alias_default_220,call_function,alias.default,forward,7,1,1,2,3,5062,3
-1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5
-1073,alias_default_221,call_function,alias.default,forward,7,1,1,2,865,5059,4
-1074,convert_element_type_186,call_function,convert_element_type.default,forward,7,1,1,1,866,5047,4
-1075,alias_default_222,call_function,alias.default,forward,7,1,1,2,867,5046,4
-1076,neg_7,call_function,neg.default,forward,7,1,1,1,868,5045,8
-1077,exp_7,call_function,exp.default,forward,7,1,1,1,869,5044,6
-1078,add_38,call_function,add.Tensor,forward,7,1,1,1,870,5043,4
-1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6
-1080,convert_element_type_187,call_function,convert_element_type.default,forward,7,1,1,1,872,5041,6
-1081,dtype_cast_71,call_function,dtype_cast.default,forward,7,1,1,1,1,5045,3
-1082,permute_86,call_function,permute.default,forward,7,1,1,1,2,5044,3
-1083,alias_default_224,call_function,alias.default,forward,7,1,1,2,3,5043,3
-1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5
-1085,alias_default_223,call_function,alias.default,forward,7,1,1,2,873,5040,4
-1086,alias_default_225,call_function,alias.default,forward,7,1,1,2,865,5040,4
-1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8
-1088,dtype_cast_72,call_function,dtype_cast.default,forward,7,1,1,1,1,5041,3
-1089,permute_87,call_function,permute.default,forward,7,1,1,1,2,5040,3
-1090,alias_default_226,call_function,alias.default,forward,7,1,1,2,881,5038,4
-1091,alias_default_227,call_function,alias.default,forward,7,1,1,2,3,5039,3
-1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5
-1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10
-1094,dtype_cast_73,call_function,dtype_cast.default,forward,8,1,1,1,1,5025,2
-1095,alias_default_228,call_function,alias.default,forward,7,1,1,3,888,5035,4
-1096,convert_element_type_192,call_function,convert_element_type.default,forward,8,1,1,1,889,5033,4
-1097,alias_default_230,call_function,alias.default,forward,8,1,1,2,890,5032,4
-1098,pow_17,call_function,pow.Tensor_Scalar,forward,8,1,1,1,891,5031,4
-1099,mean_16,call_function,mean.dim,forward,8,1,1,1,892,5030,4
-1100,add_40,call_function,add.Scalar,forward,8,1,1,1,893,5029,3
-1101,rsqrt_16,call_function,rsqrt.default,forward,8,1,1,1,894,5028,3
-1102,alias_default_231,call_function,alias.default,forward,8,1,1,3,895,5027,3
-1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8
-1104,alias_default_229,call_function,alias.default,forward,8,1,1,2,2,5024,2
-1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8
-1106,convert_element_type_193,call_function,convert_element_type.default,forward,8,1,1,1,901,5021,6
-1107,dtype_cast_74,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3
-1108,permute_88,call_function,permute.default,forward,8,1,1,1,2,5007,3
-1109,alias_default_232,call_function,alias.default,forward,8,1,1,6,902,5020,4
-1110,alias_default_233,call_function,alias.default,forward,8,1,1,2,3,5006,3
-1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5
-1112,dtype_cast_75,call_function,dtype_cast.default,forward,8,1,1,1,1,5008,3
-1113,permute_89,call_function,permute.default,forward,8,1,1,1,2,5007,3
-1114,alias_default_234,call_function,alias.default,forward,8,1,1,2,3,5006,3
-1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5
-1116,dtype_cast_76,call_function,dtype_cast.default,forward,8,1,1,1,1,5001,3
-1117,permute_90,call_function,permute.default,forward,8,1,1,1,2,5000,3
-1118,alias_default_235,call_function,alias.default,forward,8,1,1,2,3,4999,3
-1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5
-1120,view_190,call_function,view.default,forward,8,1,1,1,908,5003,4
-1121,view_191,call_function,view.default,forward,8,1,1,1,908,5003,4
-1122,view_192,call_function,view.default,forward,8,1,1,1,908,4996,4
-1123,convert_element_type_200,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4
-1124,view_193,call_function,view.default,forward,8,1,1,1,910,5001,4
-1125,view_as_complex_16,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6
-1126,convert_element_type_201,call_function,convert_element_type.default,forward,8,1,1,1,909,5002,4
-1127,view_194,call_function,view.default,forward,8,1,1,1,910,5001,4
-1128,view_as_complex_17,call_function,view_as_complex.default,forward,8,1,1,1,911,5000,6
-1129,view_195,call_function,view.default,forward,8,1,1,1,2,5011,3
-1130,alias_default_236,call_function,alias.default,forward,8,1,1,4,3,5010,3
-1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
-1132,view_as_real_16,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6
-1133,view_196,call_function,view.default,forward,8,1,1,1,916,4997,6
-1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
-1135,view_as_real_17,call_function,view_as_real.default,forward,8,1,1,1,915,4998,6
-1136,view_197,call_function,view.default,forward,8,1,1,1,916,4997,6
-1137,convert_element_type_202,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6
-1138,convert_element_type_203,call_function,convert_element_type.default,forward,8,1,1,1,917,4996,6
-1139,permute_91,call_function,permute.default,forward,8,1,1,1,918,4995,6
-1140,permute_92,call_function,permute.default,forward,8,1,1,1,918,4995,6
-1141,permute_93,call_function,permute.default,forward,8,1,1,1,909,4995,4
-1142,alias_default_237,call_function,alias.default,forward,8,1,1,2,919,4994,4
-1143,alias_default_238,call_function,alias.default,forward,8,1,1,2,919,4994,4
-1144,alias_default_239,call_function,alias.default,forward,8,1,1,2,910,4994,4
-1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2
-1146,getitem_72,call_function,getitem,forward,8,1,1,1,944,4989,2
-1147,getitem_73,call_function,getitem,forward,8,1,1,1,944,944,2
-1148,getitem_78,call_function,getitem,forward,8,1,1,1,944,944,1
-1149,getitem_79,call_function,getitem,forward,8,1,1,1,944,944,1
-1150,alias_default_240,call_function,alias.default,forward,8,1,1,2,945,4988,4
-1151,permute_94,call_function,permute.default,forward,8,1,1,1,946,4987,4
-1152,view_198,call_function,view.default,forward,8,1,1,1,947,4986,3
-1153,dtype_cast_77,call_function,dtype_cast.default,forward,8,1,1,1,1,4988,3
-1154,permute_95,call_function,permute.default,forward,8,1,1,1,2,4987,3
-1155,alias_default_241,call_function,alias.default,forward,8,1,1,2,948,4985,4
-1156,alias_default_242,call_function,alias.default,forward,8,1,1,2,3,4986,3
-1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5
-1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10
-1159,dtype_cast_78,call_function,dtype_cast.default,forward,8,1,1,1,1,4972,2
-1160,alias_default_243,call_function,alias.default,forward,8,1,1,3,955,4982,4
-1161,convert_element_type_206,call_function,convert_element_type.default,forward,8,1,1,1,956,4980,4
-1162,alias_default_245,call_function,alias.default,forward,8,1,1,2,957,4979,4
-1163,pow_18,call_function,pow.Tensor_Scalar,forward,8,1,1,1,958,4978,4
-1164,mean_17,call_function,mean.dim,forward,8,1,1,1,959,4977,4
-1165,add_42,call_function,add.Scalar,forward,8,1,1,1,960,4976,3
-1166,rsqrt_17,call_function,rsqrt.default,forward,8,1,1,1,961,4975,3
-1167,alias_default_246,call_function,alias.default,forward,8,1,1,3,962,4974,3
-1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8
-1169,alias_default_244,call_function,alias.default,forward,8,1,1,2,2,4971,2
-1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8
-1171,convert_element_type_207,call_function,convert_element_type.default,forward,8,1,1,1,968,4968,6
-1172,dtype_cast_79,call_function,dtype_cast.default,forward,8,1,1,1,1,4968,3
-1173,permute_96,call_function,permute.default,forward,8,1,1,1,2,4967,3
-1174,alias_default_247,call_function,alias.default,forward,8,1,1,4,969,4967,4
-1175,alias_default_248,call_function,alias.default,forward,8,1,1,2,3,4966,3
-1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5
-1177,alias_default_249,call_function,alias.default,forward,8,1,1,2,975,4963,4
-1178,convert_element_type_210,call_function,convert_element_type.default,forward,8,1,1,1,976,4951,4
-1179,alias_default_250,call_function,alias.default,forward,8,1,1,2,977,4950,4
-1180,neg_8,call_function,neg.default,forward,8,1,1,1,978,4949,8
-1181,exp_8,call_function,exp.default,forward,8,1,1,1,979,4948,6
-1182,add_43,call_function,add.Tensor,forward,8,1,1,1,980,4947,4
-1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6
-1184,convert_element_type_211,call_function,convert_element_type.default,forward,8,1,1,1,982,4945,6
-1185,dtype_cast_80,call_function,dtype_cast.default,forward,8,1,1,1,1,4949,3
-1186,permute_97,call_function,permute.default,forward,8,1,1,1,2,4948,3
-1187,alias_default_252,call_function,alias.default,forward,8,1,1,2,3,4947,3
-1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5
-1189,alias_default_251,call_function,alias.default,forward,8,1,1,2,983,4944,4
-1190,alias_default_253,call_function,alias.default,forward,8,1,1,2,975,4944,4
-1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8
-1192,dtype_cast_81,call_function,dtype_cast.default,forward,8,1,1,1,1,4945,3
-1193,permute_98,call_function,permute.default,forward,8,1,1,1,2,4944,3
-1194,alias_default_254,call_function,alias.default,forward,8,1,1,2,991,4942,4
-1195,alias_default_255,call_function,alias.default,forward,8,1,1,2,3,4943,3
-1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5
-1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10
-1198,dtype_cast_82,call_function,dtype_cast.default,forward,9,1,1,1,1,4929,2
-1199,alias_default_256,call_function,alias.default,forward,8,1,1,3,998,4939,4
-1200,convert_element_type_216,call_function,convert_element_type.default,forward,9,1,1,1,999,4937,4
-1201,alias_default_258,call_function,alias.default,forward,9,1,1,2,1000,4936,4
-1202,pow_19,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1001,4935,4
-1203,mean_18,call_function,mean.dim,forward,9,1,1,1,1002,4934,4
-1204,add_45,call_function,add.Scalar,forward,9,1,1,1,1003,4933,3
-1205,rsqrt_18,call_function,rsqrt.default,forward,9,1,1,1,1004,4932,3
-1206,alias_default_259,call_function,alias.default,forward,9,1,1,3,1005,4931,3
-1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8
-1208,alias_default_257,call_function,alias.default,forward,9,1,1,2,2,4928,2
-1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8
-1210,convert_element_type_217,call_function,convert_element_type.default,forward,9,1,1,1,1011,4925,6
-1211,dtype_cast_83,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3
-1212,permute_99,call_function,permute.default,forward,9,1,1,1,2,4911,3
-1213,alias_default_260,call_function,alias.default,forward,9,1,1,6,1012,4924,4
-1214,alias_default_261,call_function,alias.default,forward,9,1,1,2,3,4910,3
-1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
-1216,dtype_cast_84,call_function,dtype_cast.default,forward,9,1,1,1,1,4912,3
-1217,permute_100,call_function,permute.default,forward,9,1,1,1,2,4911,3
-1218,alias_default_262,call_function,alias.default,forward,9,1,1,2,3,4910,3
-1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
-1220,dtype_cast_85,call_function,dtype_cast.default,forward,9,1,1,1,1,4905,3
-1221,permute_101,call_function,permute.default,forward,9,1,1,1,2,4904,3
-1222,alias_default_263,call_function,alias.default,forward,9,1,1,2,3,4903,3
-1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5
-1224,view_213,call_function,view.default,forward,9,1,1,1,1018,4907,4
-1225,view_214,call_function,view.default,forward,9,1,1,1,1018,4907,4
-1226,view_215,call_function,view.default,forward,9,1,1,1,1018,4900,4
-1227,convert_element_type_224,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4
-1228,view_216,call_function,view.default,forward,9,1,1,1,1020,4905,4
-1229,view_as_complex_18,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6
-1230,convert_element_type_225,call_function,convert_element_type.default,forward,9,1,1,1,1019,4906,4
-1231,view_217,call_function,view.default,forward,9,1,1,1,1020,4905,4
-1232,view_as_complex_19,call_function,view_as_complex.default,forward,9,1,1,1,1021,4904,6
-1233,view_218,call_function,view.default,forward,9,1,1,1,2,4915,3
-1234,alias_default_264,call_function,alias.default,forward,9,1,1,4,3,4914,3
-1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
-1236,view_as_real_18,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6
-1237,view_219,call_function,view.default,forward,9,1,1,1,1026,4901,6
-1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
-1239,view_as_real_19,call_function,view_as_real.default,forward,9,1,1,1,1025,4902,6
-1240,view_220,call_function,view.default,forward,9,1,1,1,1026,4901,6
-1241,convert_element_type_226,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6
-1242,convert_element_type_227,call_function,convert_element_type.default,forward,9,1,1,1,1027,4900,6
-1243,permute_102,call_function,permute.default,forward,9,1,1,1,1028,4899,6
-1244,permute_103,call_function,permute.default,forward,9,1,1,1,1028,4899,6
-1245,permute_104,call_function,permute.default,forward,9,1,1,1,1019,4899,4
-1246,alias_default_265,call_function,alias.default,forward,9,1,1,2,1029,4898,4
-1247,alias_default_266,call_function,alias.default,forward,9,1,1,2,1029,4898,4
-1248,alias_default_267,call_function,alias.default,forward,9,1,1,2,1020,4898,4
-1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2
-1250,getitem_81,call_function,getitem,forward,9,1,1,1,1054,4893,2
-1251,getitem_82,call_function,getitem,forward,9,1,1,1,1054,1054,2
-1252,getitem_87,call_function,getitem,forward,9,1,1,1,1054,1054,1
-1253,getitem_88,call_function,getitem,forward,9,1,1,1,1054,1054,1
-1254,alias_default_268,call_function,alias.default,forward,9,1,1,2,1055,4892,4
-1255,permute_105,call_function,permute.default,forward,9,1,1,1,1056,4891,4
-1256,view_221,call_function,view.default,forward,9,1,1,1,1057,4890,3
-1257,dtype_cast_86,call_function,dtype_cast.default,forward,9,1,1,1,1,4892,3
-1258,permute_106,call_function,permute.default,forward,9,1,1,1,2,4891,3
-1259,alias_default_269,call_function,alias.default,forward,9,1,1,2,1058,4889,4
-1260,alias_default_270,call_function,alias.default,forward,9,1,1,2,3,4890,3
-1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5
-1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10
-1263,dtype_cast_87,call_function,dtype_cast.default,forward,9,1,1,1,1,4876,2
-1264,alias_default_271,call_function,alias.default,forward,9,1,1,3,1065,4886,4
-1265,convert_element_type_230,call_function,convert_element_type.default,forward,9,1,1,1,1066,4884,4
-1266,alias_default_273,call_function,alias.default,forward,9,1,1,2,1067,4883,4
-1267,pow_20,call_function,pow.Tensor_Scalar,forward,9,1,1,1,1068,4882,4
-1268,mean_19,call_function,mean.dim,forward,9,1,1,1,1069,4881,4
-1269,add_47,call_function,add.Scalar,forward,9,1,1,1,1070,4880,3
-1270,rsqrt_19,call_function,rsqrt.default,forward,9,1,1,1,1071,4879,3
-1271,alias_default_274,call_function,alias.default,forward,9,1,1,3,1072,4878,3
-1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8
-1273,alias_default_272,call_function,alias.default,forward,9,1,1,2,2,4875,2
-1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8
-1275,convert_element_type_231,call_function,convert_element_type.default,forward,9,1,1,1,1078,4872,6
-1276,dtype_cast_88,call_function,dtype_cast.default,forward,9,1,1,1,1,4872,3
-1277,permute_107,call_function,permute.default,forward,9,1,1,1,2,4871,3
-1278,alias_default_275,call_function,alias.default,forward,9,1,1,4,1079,4871,4
-1279,alias_default_276,call_function,alias.default,forward,9,1,1,2,3,4870,3
-1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5
-1281,alias_default_277,call_function,alias.default,forward,9,1,1,2,1085,4867,4
-1282,convert_element_type_234,call_function,convert_element_type.default,forward,9,1,1,1,1086,4855,4
-1283,alias_default_278,call_function,alias.default,forward,9,1,1,2,1087,4854,4
-1284,neg_9,call_function,neg.default,forward,9,1,1,1,1088,4853,8
-1285,exp_9,call_function,exp.default,forward,9,1,1,1,1089,4852,6
-1286,add_48,call_function,add.Tensor,forward,9,1,1,1,1090,4851,4
-1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6
-1288,convert_element_type_235,call_function,convert_element_type.default,forward,9,1,1,1,1092,4849,6
-1289,dtype_cast_89,call_function,dtype_cast.default,forward,9,1,1,1,1,4853,3
-1290,permute_108,call_function,permute.default,forward,9,1,1,1,2,4852,3
-1291,alias_default_280,call_function,alias.default,forward,9,1,1,2,3,4851,3
-1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5
-1293,alias_default_279,call_function,alias.default,forward,9,1,1,2,1093,4848,4
-1294,alias_default_281,call_function,alias.default,forward,9,1,1,2,1085,4848,4
-1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8
-1296,dtype_cast_90,call_function,dtype_cast.default,forward,9,1,1,1,1,4849,3
-1297,permute_109,call_function,permute.default,forward,9,1,1,1,2,4848,3
-1298,alias_default_282,call_function,alias.default,forward,9,1,1,2,1101,4846,4
-1299,alias_default_283,call_function,alias.default,forward,9,1,1,2,3,4847,3
-1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5
-1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10
-1302,dtype_cast_91,call_function,dtype_cast.default,forward,10,1,1,1,1,4833,2
-1303,alias_default_284,call_function,alias.default,forward,9,1,1,3,1108,4843,4
-1304,convert_element_type_240,call_function,convert_element_type.default,forward,10,1,1,1,1109,4841,4
-1305,alias_default_286,call_function,alias.default,forward,10,1,1,2,1110,4840,4
-1306,pow_21,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1111,4839,4
-1307,mean_20,call_function,mean.dim,forward,10,1,1,1,1112,4838,4
-1308,add_50,call_function,add.Scalar,forward,10,1,1,1,1113,4837,3
-1309,rsqrt_20,call_function,rsqrt.default,forward,10,1,1,1,1114,4836,3
-1310,alias_default_287,call_function,alias.default,forward,10,1,1,3,1115,4835,3
-1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8
-1312,alias_default_285,call_function,alias.default,forward,10,1,1,2,2,4832,2
-1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8
-1314,convert_element_type_241,call_function,convert_element_type.default,forward,10,1,1,1,1121,4829,6
-1315,dtype_cast_92,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3
-1316,permute_110,call_function,permute.default,forward,10,1,1,1,2,4815,3
-1317,alias_default_288,call_function,alias.default,forward,10,1,1,6,1122,4828,4
-1318,alias_default_289,call_function,alias.default,forward,10,1,1,2,3,4814,3
-1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
-1320,dtype_cast_93,call_function,dtype_cast.default,forward,10,1,1,1,1,4816,3
-1321,permute_111,call_function,permute.default,forward,10,1,1,1,2,4815,3
-1322,alias_default_290,call_function,alias.default,forward,10,1,1,2,3,4814,3
-1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
-1324,dtype_cast_94,call_function,dtype_cast.default,forward,10,1,1,1,1,4809,3
-1325,permute_112,call_function,permute.default,forward,10,1,1,1,2,4808,3
-1326,alias_default_291,call_function,alias.default,forward,10,1,1,2,3,4807,3
-1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5
-1328,view_236,call_function,view.default,forward,10,1,1,1,1128,4811,4
-1329,view_237,call_function,view.default,forward,10,1,1,1,1128,4811,4
-1330,view_238,call_function,view.default,forward,10,1,1,1,1128,4804,4
-1331,convert_element_type_248,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4
-1332,view_239,call_function,view.default,forward,10,1,1,1,1130,4809,4
-1333,view_as_complex_20,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6
-1334,convert_element_type_249,call_function,convert_element_type.default,forward,10,1,1,1,1129,4810,4
-1335,view_240,call_function,view.default,forward,10,1,1,1,1130,4809,4
-1336,view_as_complex_21,call_function,view_as_complex.default,forward,10,1,1,1,1131,4808,6
-1337,view_241,call_function,view.default,forward,10,1,1,1,2,4819,3
-1338,alias_default_292,call_function,alias.default,forward,10,1,1,4,3,4818,3
-1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
-1340,view_as_real_20,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6
-1341,view_242,call_function,view.default,forward,10,1,1,1,1136,4805,6
-1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
-1343,view_as_real_21,call_function,view_as_real.default,forward,10,1,1,1,1135,4806,6
-1344,view_243,call_function,view.default,forward,10,1,1,1,1136,4805,6
-1345,convert_element_type_250,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6
-1346,convert_element_type_251,call_function,convert_element_type.default,forward,10,1,1,1,1137,4804,6
-1347,permute_113,call_function,permute.default,forward,10,1,1,1,1138,4803,6
-1348,permute_114,call_function,permute.default,forward,10,1,1,1,1138,4803,6
-1349,permute_115,call_function,permute.default,forward,10,1,1,1,1129,4803,4
-1350,alias_default_293,call_function,alias.default,forward,10,1,1,2,1139,4802,4
-1351,alias_default_294,call_function,alias.default,forward,10,1,1,2,1139,4802,4
-1352,alias_default_295,call_function,alias.default,forward,10,1,1,2,1130,4802,4
-1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2
-1354,getitem_90,call_function,getitem,forward,10,1,1,1,1164,4797,2
-1355,getitem_91,call_function,getitem,forward,10,1,1,1,1164,1164,2
-1356,getitem_96,call_function,getitem,forward,10,1,1,1,1164,1164,1
-1357,getitem_97,call_function,getitem,forward,10,1,1,1,1164,1164,1
-1358,alias_default_296,call_function,alias.default,forward,10,1,1,2,1165,4796,4
-1359,permute_116,call_function,permute.default,forward,10,1,1,1,1166,4795,4
-1360,view_244,call_function,view.default,forward,10,1,1,1,1167,4794,3
-1361,dtype_cast_95,call_function,dtype_cast.default,forward,10,1,1,1,1,4796,3
-1362,permute_117,call_function,permute.default,forward,10,1,1,1,2,4795,3
-1363,alias_default_297,call_function,alias.default,forward,10,1,1,2,1168,4793,4
-1364,alias_default_298,call_function,alias.default,forward,10,1,1,2,3,4794,3
-1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5
-1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10
-1367,dtype_cast_96,call_function,dtype_cast.default,forward,10,1,1,1,1,4780,2
-1368,alias_default_299,call_function,alias.default,forward,10,1,1,3,1175,4790,4
-1369,convert_element_type_254,call_function,convert_element_type.default,forward,10,1,1,1,1176,4788,4
-1370,alias_default_301,call_function,alias.default,forward,10,1,1,2,1177,4787,4
-1371,pow_22,call_function,pow.Tensor_Scalar,forward,10,1,1,1,1178,4786,4
-1372,mean_21,call_function,mean.dim,forward,10,1,1,1,1179,4785,4
-1373,add_52,call_function,add.Scalar,forward,10,1,1,1,1180,4784,3
-1374,rsqrt_21,call_function,rsqrt.default,forward,10,1,1,1,1181,4783,3
-1375,alias_default_302,call_function,alias.default,forward,10,1,1,3,1182,4782,3
-1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8
-1377,alias_default_300,call_function,alias.default,forward,10,1,1,2,2,4779,2
-1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8
-1379,convert_element_type_255,call_function,convert_element_type.default,forward,10,1,1,1,1188,4776,6
-1380,dtype_cast_97,call_function,dtype_cast.default,forward,10,1,1,1,1,4776,3
-1381,permute_118,call_function,permute.default,forward,10,1,1,1,2,4775,3
-1382,alias_default_303,call_function,alias.default,forward,10,1,1,4,1189,4775,4
-1383,alias_default_304,call_function,alias.default,forward,10,1,1,2,3,4774,3
-1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5
-1385,alias_default_305,call_function,alias.default,forward,10,1,1,2,1195,4771,4
-1386,convert_element_type_258,call_function,convert_element_type.default,forward,10,1,1,1,1196,4759,4
-1387,alias_default_306,call_function,alias.default,forward,10,1,1,2,1197,4758,4
-1388,neg_10,call_function,neg.default,forward,10,1,1,1,1198,4757,8
-1389,exp_10,call_function,exp.default,forward,10,1,1,1,1199,4756,6
-1390,add_53,call_function,add.Tensor,forward,10,1,1,1,1200,4755,4
-1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6
-1392,convert_element_type_259,call_function,convert_element_type.default,forward,10,1,1,1,1202,4753,6
-1393,dtype_cast_98,call_function,dtype_cast.default,forward,10,1,1,1,1,4757,3
-1394,permute_119,call_function,permute.default,forward,10,1,1,1,2,4756,3
-1395,alias_default_308,call_function,alias.default,forward,10,1,1,2,3,4755,3
-1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5
-1397,alias_default_307,call_function,alias.default,forward,10,1,1,2,1203,4752,4
-1398,alias_default_309,call_function,alias.default,forward,10,1,1,2,1195,4752,4
-1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8
-1400,dtype_cast_99,call_function,dtype_cast.default,forward,10,1,1,1,1,4753,3
-1401,permute_120,call_function,permute.default,forward,10,1,1,1,2,4752,3
-1402,alias_default_310,call_function,alias.default,forward,10,1,1,2,1211,4750,4
-1403,alias_default_311,call_function,alias.default,forward,10,1,1,2,3,4751,3
-1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5
-1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10
-1406,dtype_cast_100,call_function,dtype_cast.default,forward,11,1,1,1,1,4737,2
-1407,alias_default_312,call_function,alias.default,forward,10,1,1,3,1218,4747,4
-1408,convert_element_type_264,call_function,convert_element_type.default,forward,11,1,1,1,1219,4745,4
-1409,alias_default_314,call_function,alias.default,forward,11,1,1,2,1220,4744,4
-1410,pow_23,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1221,4743,4
-1411,mean_22,call_function,mean.dim,forward,11,1,1,1,1222,4742,4
-1412,add_55,call_function,add.Scalar,forward,11,1,1,1,1223,4741,3
-1413,rsqrt_22,call_function,rsqrt.default,forward,11,1,1,1,1224,4740,3
-1414,alias_default_315,call_function,alias.default,forward,11,1,1,3,1225,4739,3
-1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8
-1416,alias_default_313,call_function,alias.default,forward,11,1,1,2,2,4736,2
-1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8
-1418,convert_element_type_265,call_function,convert_element_type.default,forward,11,1,1,1,1231,4733,6
-1419,dtype_cast_101,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3
-1420,permute_121,call_function,permute.default,forward,11,1,1,1,2,4719,3
-1421,alias_default_316,call_function,alias.default,forward,11,1,1,6,1232,4732,4
-1422,alias_default_317,call_function,alias.default,forward,11,1,1,2,3,4718,3
-1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
-1424,dtype_cast_102,call_function,dtype_cast.default,forward,11,1,1,1,1,4720,3
-1425,permute_122,call_function,permute.default,forward,11,1,1,1,2,4719,3
-1426,alias_default_318,call_function,alias.default,forward,11,1,1,2,3,4718,3
-1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
-1428,dtype_cast_103,call_function,dtype_cast.default,forward,11,1,1,1,1,4713,3
-1429,permute_123,call_function,permute.default,forward,11,1,1,1,2,4712,3
-1430,alias_default_319,call_function,alias.default,forward,11,1,1,2,3,4711,3
-1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5
-1432,view_259,call_function,view.default,forward,11,1,1,1,1238,4715,4
-1433,view_260,call_function,view.default,forward,11,1,1,1,1238,4715,4
-1434,view_261,call_function,view.default,forward,11,1,1,1,1238,4708,4
-1435,convert_element_type_272,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4
-1436,view_262,call_function,view.default,forward,11,1,1,1,1240,4713,4
-1437,view_as_complex_22,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6
-1438,convert_element_type_273,call_function,convert_element_type.default,forward,11,1,1,1,1239,4714,4
-1439,view_263,call_function,view.default,forward,11,1,1,1,1240,4713,4
-1440,view_as_complex_23,call_function,view_as_complex.default,forward,11,1,1,1,1241,4712,6
-1441,view_264,call_function,view.default,forward,11,1,1,1,2,4723,3
-1442,alias_default_320,call_function,alias.default,forward,11,1,1,4,3,4722,3
-1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
-1444,view_as_real_22,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6
-1445,view_265,call_function,view.default,forward,11,1,1,1,1246,4709,6
-1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
-1447,view_as_real_23,call_function,view_as_real.default,forward,11,1,1,1,1245,4710,6
-1448,view_266,call_function,view.default,forward,11,1,1,1,1246,4709,6
-1449,convert_element_type_274,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6
-1450,convert_element_type_275,call_function,convert_element_type.default,forward,11,1,1,1,1247,4708,6
-1451,permute_124,call_function,permute.default,forward,11,1,1,1,1248,4707,6
-1452,permute_125,call_function,permute.default,forward,11,1,1,1,1248,4707,6
-1453,permute_126,call_function,permute.default,forward,11,1,1,1,1239,4707,4
-1454,alias_default_321,call_function,alias.default,forward,11,1,1,2,1249,4706,4
-1455,alias_default_322,call_function,alias.default,forward,11,1,1,2,1249,4706,4
-1456,alias_default_323,call_function,alias.default,forward,11,1,1,2,1240,4706,4
-1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2
-1458,getitem_99,call_function,getitem,forward,11,1,1,1,1274,4701,2
-1459,getitem_100,call_function,getitem,forward,11,1,1,1,1274,1274,2
-1460,getitem_105,call_function,getitem,forward,11,1,1,1,1274,1274,1
-1461,getitem_106,call_function,getitem,forward,11,1,1,1,1274,1274,1
-1462,alias_default_324,call_function,alias.default,forward,11,1,1,2,1275,4700,4
-1463,permute_127,call_function,permute.default,forward,11,1,1,1,1276,4699,4
-1464,view_267,call_function,view.default,forward,11,1,1,1,1277,4698,3
-1465,dtype_cast_104,call_function,dtype_cast.default,forward,11,1,1,1,1,4700,3
-1466,permute_128,call_function,permute.default,forward,11,1,1,1,2,4699,3
-1467,alias_default_325,call_function,alias.default,forward,11,1,1,2,1278,4697,4
-1468,alias_default_326,call_function,alias.default,forward,11,1,1,2,3,4698,3
-1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5
-1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10
-1471,dtype_cast_105,call_function,dtype_cast.default,forward,11,1,1,1,1,4684,2
-1472,alias_default_327,call_function,alias.default,forward,11,1,1,3,1285,4694,4
-1473,convert_element_type_278,call_function,convert_element_type.default,forward,11,1,1,1,1286,4692,4
-1474,alias_default_329,call_function,alias.default,forward,11,1,1,2,1287,4691,4
-1475,pow_24,call_function,pow.Tensor_Scalar,forward,11,1,1,1,1288,4690,4
-1476,mean_23,call_function,mean.dim,forward,11,1,1,1,1289,4689,4
-1477,add_57,call_function,add.Scalar,forward,11,1,1,1,1290,4688,3
-1478,rsqrt_23,call_function,rsqrt.default,forward,11,1,1,1,1291,4687,3
-1479,alias_default_330,call_function,alias.default,forward,11,1,1,3,1292,4686,3
-1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8
-1481,alias_default_328,call_function,alias.default,forward,11,1,1,2,2,4683,2
-1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8
-1483,convert_element_type_279,call_function,convert_element_type.default,forward,11,1,1,1,1298,4680,6
-1484,dtype_cast_106,call_function,dtype_cast.default,forward,11,1,1,1,1,4680,3
-1485,permute_129,call_function,permute.default,forward,11,1,1,1,2,4679,3
-1486,alias_default_331,call_function,alias.default,forward,11,1,1,4,1299,4679,4
-1487,alias_default_332,call_function,alias.default,forward,11,1,1,2,3,4678,3
-1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5
-1489,alias_default_333,call_function,alias.default,forward,11,1,1,2,1305,4675,4
-1490,convert_element_type_282,call_function,convert_element_type.default,forward,11,1,1,1,1306,4663,4
-1491,alias_default_334,call_function,alias.default,forward,11,1,1,2,1307,4662,4
-1492,neg_11,call_function,neg.default,forward,11,1,1,1,1308,4661,8
-1493,exp_11,call_function,exp.default,forward,11,1,1,1,1309,4660,6
-1494,add_58,call_function,add.Tensor,forward,11,1,1,1,1310,4659,4
-1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6
-1496,convert_element_type_283,call_function,convert_element_type.default,forward,11,1,1,1,1312,4657,6
-1497,dtype_cast_107,call_function,dtype_cast.default,forward,11,1,1,1,1,4661,3
-1498,permute_130,call_function,permute.default,forward,11,1,1,1,2,4660,3
-1499,alias_default_336,call_function,alias.default,forward,11,1,1,2,3,4659,3
-1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5
-1501,alias_default_335,call_function,alias.default,forward,11,1,1,2,1313,4656,4
-1502,alias_default_337,call_function,alias.default,forward,11,1,1,2,1305,4656,4
-1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8
-1504,dtype_cast_108,call_function,dtype_cast.default,forward,11,1,1,1,1,4657,3
-1505,permute_131,call_function,permute.default,forward,11,1,1,1,2,4656,3
-1506,alias_default_338,call_function,alias.default,forward,11,1,1,2,1321,4654,4
-1507,alias_default_339,call_function,alias.default,forward,11,1,1,2,3,4655,3
-1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5
-1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10
-1510,dtype_cast_109,call_function,dtype_cast.default,forward,12,1,1,1,1,4641,2
-1511,alias_default_340,call_function,alias.default,forward,11,1,1,3,1328,4651,4
-1512,convert_element_type_288,call_function,convert_element_type.default,forward,12,1,1,1,1329,4649,4
-1513,alias_default_342,call_function,alias.default,forward,12,1,1,2,1330,4648,4
-1514,pow_25,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1331,4647,4
-1515,mean_24,call_function,mean.dim,forward,12,1,1,1,1332,4646,4
-1516,add_60,call_function,add.Scalar,forward,12,1,1,1,1333,4645,3
-1517,rsqrt_24,call_function,rsqrt.default,forward,12,1,1,1,1334,4644,3
-1518,alias_default_343,call_function,alias.default,forward,12,1,1,3,1335,4643,3
-1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8
-1520,alias_default_341,call_function,alias.default,forward,12,1,1,2,2,4640,2
-1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8
-1522,convert_element_type_289,call_function,convert_element_type.default,forward,12,1,1,1,1341,4637,6
-1523,dtype_cast_110,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3
-1524,permute_132,call_function,permute.default,forward,12,1,1,1,2,4623,3
-1525,alias_default_344,call_function,alias.default,forward,12,1,1,6,1342,4636,4
-1526,alias_default_345,call_function,alias.default,forward,12,1,1,2,3,4622,3
-1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
-1528,dtype_cast_111,call_function,dtype_cast.default,forward,12,1,1,1,1,4624,3
-1529,permute_133,call_function,permute.default,forward,12,1,1,1,2,4623,3
-1530,alias_default_346,call_function,alias.default,forward,12,1,1,2,3,4622,3
-1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
-1532,dtype_cast_112,call_function,dtype_cast.default,forward,12,1,1,1,1,4617,3
-1533,permute_134,call_function,permute.default,forward,12,1,1,1,2,4616,3
-1534,alias_default_347,call_function,alias.default,forward,12,1,1,2,3,4615,3
-1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5
-1536,view_282,call_function,view.default,forward,12,1,1,1,1348,4619,4
-1537,view_283,call_function,view.default,forward,12,1,1,1,1348,4619,4
-1538,view_284,call_function,view.default,forward,12,1,1,1,1348,4612,4
-1539,convert_element_type_296,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4
-1540,view_285,call_function,view.default,forward,12,1,1,1,1350,4617,4
-1541,view_as_complex_24,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6
-1542,convert_element_type_297,call_function,convert_element_type.default,forward,12,1,1,1,1349,4618,4
-1543,view_286,call_function,view.default,forward,12,1,1,1,1350,4617,4
-1544,view_as_complex_25,call_function,view_as_complex.default,forward,12,1,1,1,1351,4616,6
-1545,view_287,call_function,view.default,forward,12,1,1,1,2,4627,3
-1546,alias_default_348,call_function,alias.default,forward,12,1,1,4,3,4626,3
-1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
-1548,view_as_real_24,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6
-1549,view_288,call_function,view.default,forward,12,1,1,1,1356,4613,6
-1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
-1551,view_as_real_25,call_function,view_as_real.default,forward,12,1,1,1,1355,4614,6
-1552,view_289,call_function,view.default,forward,12,1,1,1,1356,4613,6
-1553,convert_element_type_298,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6
-1554,convert_element_type_299,call_function,convert_element_type.default,forward,12,1,1,1,1357,4612,6
-1555,permute_135,call_function,permute.default,forward,12,1,1,1,1358,4611,6
-1556,permute_136,call_function,permute.default,forward,12,1,1,1,1358,4611,6
-1557,permute_137,call_function,permute.default,forward,12,1,1,1,1349,4611,4
-1558,alias_default_349,call_function,alias.default,forward,12,1,1,2,1359,4610,4
-1559,alias_default_350,call_function,alias.default,forward,12,1,1,2,1359,4610,4
-1560,alias_default_351,call_function,alias.default,forward,12,1,1,2,1350,4610,4
-1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2
-1562,getitem_108,call_function,getitem,forward,12,1,1,1,1384,4605,2
-1563,getitem_109,call_function,getitem,forward,12,1,1,1,1384,1384,2
-1564,getitem_114,call_function,getitem,forward,12,1,1,1,1384,1384,1
-1565,getitem_115,call_function,getitem,forward,12,1,1,1,1384,1384,1
-1566,alias_default_352,call_function,alias.default,forward,12,1,1,2,1385,4604,4
-1567,permute_138,call_function,permute.default,forward,12,1,1,1,1386,4603,4
-1568,view_290,call_function,view.default,forward,12,1,1,1,1387,4602,3
-1569,dtype_cast_113,call_function,dtype_cast.default,forward,12,1,1,1,1,4604,3
-1570,permute_139,call_function,permute.default,forward,12,1,1,1,2,4603,3
-1571,alias_default_353,call_function,alias.default,forward,12,1,1,2,1388,4601,4
-1572,alias_default_354,call_function,alias.default,forward,12,1,1,2,3,4602,3
-1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5
-1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10
-1575,dtype_cast_114,call_function,dtype_cast.default,forward,12,1,1,1,1,4588,2
-1576,alias_default_355,call_function,alias.default,forward,12,1,1,3,1395,4598,4
-1577,convert_element_type_302,call_function,convert_element_type.default,forward,12,1,1,1,1396,4596,4
-1578,alias_default_357,call_function,alias.default,forward,12,1,1,2,1397,4595,4
-1579,pow_26,call_function,pow.Tensor_Scalar,forward,12,1,1,1,1398,4594,4
-1580,mean_25,call_function,mean.dim,forward,12,1,1,1,1399,4593,4
-1581,add_62,call_function,add.Scalar,forward,12,1,1,1,1400,4592,3
-1582,rsqrt_25,call_function,rsqrt.default,forward,12,1,1,1,1401,4591,3
-1583,alias_default_358,call_function,alias.default,forward,12,1,1,3,1402,4590,3
-1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8
-1585,alias_default_356,call_function,alias.default,forward,12,1,1,2,2,4587,2
-1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8
-1587,convert_element_type_303,call_function,convert_element_type.default,forward,12,1,1,1,1408,4584,6
-1588,dtype_cast_115,call_function,dtype_cast.default,forward,12,1,1,1,1,4584,3
-1589,permute_140,call_function,permute.default,forward,12,1,1,1,2,4583,3
-1590,alias_default_359,call_function,alias.default,forward,12,1,1,4,1409,4583,4
-1591,alias_default_360,call_function,alias.default,forward,12,1,1,2,3,4582,3
-1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5
-1593,alias_default_361,call_function,alias.default,forward,12,1,1,2,1415,4579,4
-1594,convert_element_type_306,call_function,convert_element_type.default,forward,12,1,1,1,1416,4567,4
-1595,alias_default_362,call_function,alias.default,forward,12,1,1,2,1417,4566,4
-1596,neg_12,call_function,neg.default,forward,12,1,1,1,1418,4565,8
-1597,exp_12,call_function,exp.default,forward,12,1,1,1,1419,4564,6
-1598,add_63,call_function,add.Tensor,forward,12,1,1,1,1420,4563,4
-1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6
-1600,convert_element_type_307,call_function,convert_element_type.default,forward,12,1,1,1,1422,4561,6
-1601,dtype_cast_116,call_function,dtype_cast.default,forward,12,1,1,1,1,4565,3
-1602,permute_141,call_function,permute.default,forward,12,1,1,1,2,4564,3
-1603,alias_default_364,call_function,alias.default,forward,12,1,1,2,3,4563,3
-1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5
-1605,alias_default_363,call_function,alias.default,forward,12,1,1,2,1423,4560,4
-1606,alias_default_365,call_function,alias.default,forward,12,1,1,2,1415,4560,4
-1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8
-1608,dtype_cast_117,call_function,dtype_cast.default,forward,12,1,1,1,1,4561,3
-1609,permute_142,call_function,permute.default,forward,12,1,1,1,2,4560,3
-1610,alias_default_366,call_function,alias.default,forward,12,1,1,2,1431,4558,4
-1611,alias_default_367,call_function,alias.default,forward,12,1,1,2,3,4559,3
-1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5
-1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10
-1614,dtype_cast_118,call_function,dtype_cast.default,forward,13,1,1,1,1,4545,2
-1615,alias_default_368,call_function,alias.default,forward,12,1,1,3,1438,4555,4
-1616,convert_element_type_312,call_function,convert_element_type.default,forward,13,1,1,1,1439,4553,4
-1617,alias_default_370,call_function,alias.default,forward,13,1,1,2,1440,4552,4
-1618,pow_27,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1441,4551,4
-1619,mean_26,call_function,mean.dim,forward,13,1,1,1,1442,4550,4
-1620,add_65,call_function,add.Scalar,forward,13,1,1,1,1443,4549,3
-1621,rsqrt_26,call_function,rsqrt.default,forward,13,1,1,1,1444,4548,3
-1622,alias_default_371,call_function,alias.default,forward,13,1,1,3,1445,4547,3
-1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8
-1624,alias_default_369,call_function,alias.default,forward,13,1,1,2,2,4544,2
-1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8
-1626,convert_element_type_313,call_function,convert_element_type.default,forward,13,1,1,1,1451,4541,6
-1627,dtype_cast_119,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3
-1628,permute_143,call_function,permute.default,forward,13,1,1,1,2,4527,3
-1629,alias_default_372,call_function,alias.default,forward,13,1,1,6,1452,4540,4
-1630,alias_default_373,call_function,alias.default,forward,13,1,1,2,3,4526,3
-1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
-1632,dtype_cast_120,call_function,dtype_cast.default,forward,13,1,1,1,1,4528,3
-1633,permute_144,call_function,permute.default,forward,13,1,1,1,2,4527,3
-1634,alias_default_374,call_function,alias.default,forward,13,1,1,2,3,4526,3
-1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
-1636,dtype_cast_121,call_function,dtype_cast.default,forward,13,1,1,1,1,4521,3
-1637,permute_145,call_function,permute.default,forward,13,1,1,1,2,4520,3
-1638,alias_default_375,call_function,alias.default,forward,13,1,1,2,3,4519,3
-1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5
-1640,view_305,call_function,view.default,forward,13,1,1,1,1458,4523,4
-1641,view_306,call_function,view.default,forward,13,1,1,1,1458,4523,4
-1642,view_307,call_function,view.default,forward,13,1,1,1,1458,4516,4
-1643,convert_element_type_320,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4
-1644,view_308,call_function,view.default,forward,13,1,1,1,1460,4521,4
-1645,view_as_complex_26,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6
-1646,convert_element_type_321,call_function,convert_element_type.default,forward,13,1,1,1,1459,4522,4
-1647,view_309,call_function,view.default,forward,13,1,1,1,1460,4521,4
-1648,view_as_complex_27,call_function,view_as_complex.default,forward,13,1,1,1,1461,4520,6
-1649,view_310,call_function,view.default,forward,13,1,1,1,2,4531,3
-1650,alias_default_376,call_function,alias.default,forward,13,1,1,4,3,4530,3
-1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
-1652,view_as_real_26,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6
-1653,view_311,call_function,view.default,forward,13,1,1,1,1466,4517,6
-1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
-1655,view_as_real_27,call_function,view_as_real.default,forward,13,1,1,1,1465,4518,6
-1656,view_312,call_function,view.default,forward,13,1,1,1,1466,4517,6
-1657,convert_element_type_322,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6
-1658,convert_element_type_323,call_function,convert_element_type.default,forward,13,1,1,1,1467,4516,6
-1659,permute_146,call_function,permute.default,forward,13,1,1,1,1468,4515,6
-1660,permute_147,call_function,permute.default,forward,13,1,1,1,1468,4515,6
-1661,permute_148,call_function,permute.default,forward,13,1,1,1,1459,4515,4
-1662,alias_default_377,call_function,alias.default,forward,13,1,1,2,1469,4514,4
-1663,alias_default_378,call_function,alias.default,forward,13,1,1,2,1469,4514,4
-1664,alias_default_379,call_function,alias.default,forward,13,1,1,2,1460,4514,4
-1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2
-1666,getitem_117,call_function,getitem,forward,13,1,1,1,1494,4509,2
-1667,getitem_118,call_function,getitem,forward,13,1,1,1,1494,1494,2
-1668,getitem_123,call_function,getitem,forward,13,1,1,1,1494,1494,1
-1669,getitem_124,call_function,getitem,forward,13,1,1,1,1494,1494,1
-1670,alias_default_380,call_function,alias.default,forward,13,1,1,2,1495,4508,4
-1671,permute_149,call_function,permute.default,forward,13,1,1,1,1496,4507,4
-1672,view_313,call_function,view.default,forward,13,1,1,1,1497,4506,3
-1673,dtype_cast_122,call_function,dtype_cast.default,forward,13,1,1,1,1,4508,3
-1674,permute_150,call_function,permute.default,forward,13,1,1,1,2,4507,3
-1675,alias_default_381,call_function,alias.default,forward,13,1,1,2,1498,4505,4
-1676,alias_default_382,call_function,alias.default,forward,13,1,1,2,3,4506,3
-1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5
-1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10
-1679,dtype_cast_123,call_function,dtype_cast.default,forward,13,1,1,1,1,4492,2
-1680,alias_default_383,call_function,alias.default,forward,13,1,1,3,1505,4502,4
-1681,convert_element_type_326,call_function,convert_element_type.default,forward,13,1,1,1,1506,4500,4
-1682,alias_default_385,call_function,alias.default,forward,13,1,1,2,1507,4499,4
-1683,pow_28,call_function,pow.Tensor_Scalar,forward,13,1,1,1,1508,4498,4
-1684,mean_27,call_function,mean.dim,forward,13,1,1,1,1509,4497,4
-1685,add_67,call_function,add.Scalar,forward,13,1,1,1,1510,4496,3
-1686,rsqrt_27,call_function,rsqrt.default,forward,13,1,1,1,1511,4495,3
-1687,alias_default_386,call_function,alias.default,forward,13,1,1,3,1512,4494,3
-1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8
-1689,alias_default_384,call_function,alias.default,forward,13,1,1,2,2,4491,2
-1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8
-1691,convert_element_type_327,call_function,convert_element_type.default,forward,13,1,1,1,1518,4488,6
-1692,dtype_cast_124,call_function,dtype_cast.default,forward,13,1,1,1,1,4488,3
-1693,permute_151,call_function,permute.default,forward,13,1,1,1,2,4487,3
-1694,alias_default_387,call_function,alias.default,forward,13,1,1,4,1519,4487,4
-1695,alias_default_388,call_function,alias.default,forward,13,1,1,2,3,4486,3
-1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5
-1697,alias_default_389,call_function,alias.default,forward,13,1,1,2,1525,4483,4
-1698,convert_element_type_330,call_function,convert_element_type.default,forward,13,1,1,1,1526,4471,4
-1699,alias_default_390,call_function,alias.default,forward,13,1,1,2,1527,4470,4
-1700,neg_13,call_function,neg.default,forward,13,1,1,1,1528,4469,8
-1701,exp_13,call_function,exp.default,forward,13,1,1,1,1529,4468,6
-1702,add_68,call_function,add.Tensor,forward,13,1,1,1,1530,4467,4
-1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6
-1704,convert_element_type_331,call_function,convert_element_type.default,forward,13,1,1,1,1532,4465,6
-1705,dtype_cast_125,call_function,dtype_cast.default,forward,13,1,1,1,1,4469,3
-1706,permute_152,call_function,permute.default,forward,13,1,1,1,2,4468,3
-1707,alias_default_392,call_function,alias.default,forward,13,1,1,2,3,4467,3
-1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5
-1709,alias_default_391,call_function,alias.default,forward,13,1,1,2,1533,4464,4
-1710,alias_default_393,call_function,alias.default,forward,13,1,1,2,1525,4464,4
-1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8
-1712,dtype_cast_126,call_function,dtype_cast.default,forward,13,1,1,1,1,4465,3
-1713,permute_153,call_function,permute.default,forward,13,1,1,1,2,4464,3
-1714,alias_default_394,call_function,alias.default,forward,13,1,1,2,1541,4462,4
-1715,alias_default_395,call_function,alias.default,forward,13,1,1,2,3,4463,3
-1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5
-1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10
-1718,dtype_cast_127,call_function,dtype_cast.default,forward,14,1,1,1,1,4449,2
-1719,alias_default_396,call_function,alias.default,forward,13,1,1,3,1548,4459,4
-1720,convert_element_type_336,call_function,convert_element_type.default,forward,14,1,1,1,1549,4457,4
-1721,alias_default_398,call_function,alias.default,forward,14,1,1,2,1550,4456,4
-1722,pow_29,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1551,4455,4
-1723,mean_28,call_function,mean.dim,forward,14,1,1,1,1552,4454,4
-1724,add_70,call_function,add.Scalar,forward,14,1,1,1,1553,4453,3
-1725,rsqrt_28,call_function,rsqrt.default,forward,14,1,1,1,1554,4452,3
-1726,alias_default_399,call_function,alias.default,forward,14,1,1,3,1555,4451,3
-1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8
-1728,alias_default_397,call_function,alias.default,forward,14,1,1,2,2,4448,2
-1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8
-1730,convert_element_type_337,call_function,convert_element_type.default,forward,14,1,1,1,1561,4445,6
-1731,dtype_cast_128,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3
-1732,permute_154,call_function,permute.default,forward,14,1,1,1,2,4431,3
-1733,alias_default_400,call_function,alias.default,forward,14,1,1,6,1562,4444,4
-1734,alias_default_401,call_function,alias.default,forward,14,1,1,2,3,4430,3
-1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
-1736,dtype_cast_129,call_function,dtype_cast.default,forward,14,1,1,1,1,4432,3
-1737,permute_155,call_function,permute.default,forward,14,1,1,1,2,4431,3
-1738,alias_default_402,call_function,alias.default,forward,14,1,1,2,3,4430,3
-1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
-1740,dtype_cast_130,call_function,dtype_cast.default,forward,14,1,1,1,1,4425,3
-1741,permute_156,call_function,permute.default,forward,14,1,1,1,2,4424,3
-1742,alias_default_403,call_function,alias.default,forward,14,1,1,2,3,4423,3
-1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5
-1744,view_328,call_function,view.default,forward,14,1,1,1,1568,4427,4
-1745,view_329,call_function,view.default,forward,14,1,1,1,1568,4427,4
-1746,view_330,call_function,view.default,forward,14,1,1,1,1568,4420,4
-1747,convert_element_type_344,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4
-1748,view_331,call_function,view.default,forward,14,1,1,1,1570,4425,4
-1749,view_as_complex_28,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6
-1750,convert_element_type_345,call_function,convert_element_type.default,forward,14,1,1,1,1569,4426,4
-1751,view_332,call_function,view.default,forward,14,1,1,1,1570,4425,4
-1752,view_as_complex_29,call_function,view_as_complex.default,forward,14,1,1,1,1571,4424,6
-1753,view_333,call_function,view.default,forward,14,1,1,1,2,4435,3
-1754,alias_default_404,call_function,alias.default,forward,14,1,1,4,3,4434,3
-1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
-1756,view_as_real_28,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6
-1757,view_334,call_function,view.default,forward,14,1,1,1,1576,4421,6
-1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
-1759,view_as_real_29,call_function,view_as_real.default,forward,14,1,1,1,1575,4422,6
-1760,view_335,call_function,view.default,forward,14,1,1,1,1576,4421,6
-1761,convert_element_type_346,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6
-1762,convert_element_type_347,call_function,convert_element_type.default,forward,14,1,1,1,1577,4420,6
-1763,permute_157,call_function,permute.default,forward,14,1,1,1,1578,4419,6
-1764,permute_158,call_function,permute.default,forward,14,1,1,1,1578,4419,6
-1765,permute_159,call_function,permute.default,forward,14,1,1,1,1569,4419,4
-1766,alias_default_405,call_function,alias.default,forward,14,1,1,2,1579,4418,4
-1767,alias_default_406,call_function,alias.default,forward,14,1,1,2,1579,4418,4
-1768,alias_default_407,call_function,alias.default,forward,14,1,1,2,1570,4418,4
-1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2
-1770,getitem_126,call_function,getitem,forward,14,1,1,1,1604,4413,2
-1771,getitem_127,call_function,getitem,forward,14,1,1,1,1604,1604,2
-1772,getitem_132,call_function,getitem,forward,14,1,1,1,1604,1604,1
-1773,getitem_133,call_function,getitem,forward,14,1,1,1,1604,1604,1
-1774,alias_default_408,call_function,alias.default,forward,14,1,1,2,1605,4412,4
-1775,permute_160,call_function,permute.default,forward,14,1,1,1,1606,4411,4
-1776,view_336,call_function,view.default,forward,14,1,1,1,1607,4410,3
-1777,dtype_cast_131,call_function,dtype_cast.default,forward,14,1,1,1,1,4412,3
-1778,permute_161,call_function,permute.default,forward,14,1,1,1,2,4411,3
-1779,alias_default_409,call_function,alias.default,forward,14,1,1,2,1608,4409,4
-1780,alias_default_410,call_function,alias.default,forward,14,1,1,2,3,4410,3
-1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5
-1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10
-1783,dtype_cast_132,call_function,dtype_cast.default,forward,14,1,1,1,1,4396,2
-1784,alias_default_411,call_function,alias.default,forward,14,1,1,3,1615,4406,4
-1785,convert_element_type_350,call_function,convert_element_type.default,forward,14,1,1,1,1616,4404,4
-1786,alias_default_413,call_function,alias.default,forward,14,1,1,2,1617,4403,4
-1787,pow_30,call_function,pow.Tensor_Scalar,forward,14,1,1,1,1618,4402,4
-1788,mean_29,call_function,mean.dim,forward,14,1,1,1,1619,4401,4
-1789,add_72,call_function,add.Scalar,forward,14,1,1,1,1620,4400,3
-1790,rsqrt_29,call_function,rsqrt.default,forward,14,1,1,1,1621,4399,3
-1791,alias_default_414,call_function,alias.default,forward,14,1,1,3,1622,4398,3
-1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8
-1793,alias_default_412,call_function,alias.default,forward,14,1,1,2,2,4395,2
-1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8
-1795,convert_element_type_351,call_function,convert_element_type.default,forward,14,1,1,1,1628,4392,6
-1796,dtype_cast_133,call_function,dtype_cast.default,forward,14,1,1,1,1,4392,3
-1797,permute_162,call_function,permute.default,forward,14,1,1,1,2,4391,3
-1798,alias_default_415,call_function,alias.default,forward,14,1,1,4,1629,4391,4
-1799,alias_default_416,call_function,alias.default,forward,14,1,1,2,3,4390,3
-1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5
-1801,alias_default_417,call_function,alias.default,forward,14,1,1,2,1635,4387,4
-1802,convert_element_type_354,call_function,convert_element_type.default,forward,14,1,1,1,1636,4375,4
-1803,alias_default_418,call_function,alias.default,forward,14,1,1,2,1637,4374,4
-1804,neg_14,call_function,neg.default,forward,14,1,1,1,1638,4373,8
-1805,exp_14,call_function,exp.default,forward,14,1,1,1,1639,4372,6
-1806,add_73,call_function,add.Tensor,forward,14,1,1,1,1640,4371,4
-1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6
-1808,convert_element_type_355,call_function,convert_element_type.default,forward,14,1,1,1,1642,4369,6
-1809,dtype_cast_134,call_function,dtype_cast.default,forward,14,1,1,1,1,4373,3
-1810,permute_163,call_function,permute.default,forward,14,1,1,1,2,4372,3
-1811,alias_default_420,call_function,alias.default,forward,14,1,1,2,3,4371,3
-1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5
-1813,alias_default_419,call_function,alias.default,forward,14,1,1,2,1643,4368,4
-1814,alias_default_421,call_function,alias.default,forward,14,1,1,2,1635,4368,4
-1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8
-1816,dtype_cast_135,call_function,dtype_cast.default,forward,14,1,1,1,1,4369,3
-1817,permute_164,call_function,permute.default,forward,14,1,1,1,2,4368,3
-1818,alias_default_422,call_function,alias.default,forward,14,1,1,2,1651,4366,4
-1819,alias_default_423,call_function,alias.default,forward,14,1,1,2,3,4367,3
-1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5
-1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10
-1822,dtype_cast_136,call_function,dtype_cast.default,forward,15,1,1,1,1,4353,2
-1823,alias_default_424,call_function,alias.default,forward,14,1,1,3,1658,4363,4
-1824,convert_element_type_360,call_function,convert_element_type.default,forward,15,1,1,1,1659,4361,4
-1825,alias_default_426,call_function,alias.default,forward,15,1,1,2,1660,4360,4
-1826,pow_31,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1661,4359,4
-1827,mean_30,call_function,mean.dim,forward,15,1,1,1,1662,4358,4
-1828,add_75,call_function,add.Scalar,forward,15,1,1,1,1663,4357,3
-1829,rsqrt_30,call_function,rsqrt.default,forward,15,1,1,1,1664,4356,3
-1830,alias_default_427,call_function,alias.default,forward,15,1,1,3,1665,4355,3
-1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8
-1832,alias_default_425,call_function,alias.default,forward,15,1,1,2,2,4352,2
-1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8
-1834,convert_element_type_361,call_function,convert_element_type.default,forward,15,1,1,1,1671,4349,6
-1835,dtype_cast_137,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3
-1836,permute_165,call_function,permute.default,forward,15,1,1,1,2,4335,3
-1837,alias_default_428,call_function,alias.default,forward,15,1,1,6,1672,4348,4
-1838,alias_default_429,call_function,alias.default,forward,15,1,1,2,3,4334,3
-1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
-1840,dtype_cast_138,call_function,dtype_cast.default,forward,15,1,1,1,1,4336,3
-1841,permute_166,call_function,permute.default,forward,15,1,1,1,2,4335,3
-1842,alias_default_430,call_function,alias.default,forward,15,1,1,2,3,4334,3
-1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
-1844,dtype_cast_139,call_function,dtype_cast.default,forward,15,1,1,1,1,4329,3
-1845,permute_167,call_function,permute.default,forward,15,1,1,1,2,4328,3
-1846,alias_default_431,call_function,alias.default,forward,15,1,1,2,3,4327,3
-1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5
-1848,view_351,call_function,view.default,forward,15,1,1,1,1678,4331,4
-1849,view_352,call_function,view.default,forward,15,1,1,1,1678,4331,4
-1850,view_353,call_function,view.default,forward,15,1,1,1,1678,4324,4
-1851,convert_element_type_368,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4
-1852,view_354,call_function,view.default,forward,15,1,1,1,1680,4329,4
-1853,view_as_complex_30,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6
-1854,convert_element_type_369,call_function,convert_element_type.default,forward,15,1,1,1,1679,4330,4
-1855,view_355,call_function,view.default,forward,15,1,1,1,1680,4329,4
-1856,view_as_complex_31,call_function,view_as_complex.default,forward,15,1,1,1,1681,4328,6
-1857,view_356,call_function,view.default,forward,15,1,1,1,2,4339,3
-1858,alias_default_432,call_function,alias.default,forward,15,1,1,4,3,4338,3
-1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
-1860,view_as_real_30,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6
-1861,view_357,call_function,view.default,forward,15,1,1,1,1686,4325,6
-1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
-1863,view_as_real_31,call_function,view_as_real.default,forward,15,1,1,1,1685,4326,6
-1864,view_358,call_function,view.default,forward,15,1,1,1,1686,4325,6
-1865,convert_element_type_370,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6
-1866,convert_element_type_371,call_function,convert_element_type.default,forward,15,1,1,1,1687,4324,6
-1867,permute_168,call_function,permute.default,forward,15,1,1,1,1688,4323,6
-1868,permute_169,call_function,permute.default,forward,15,1,1,1,1688,4323,6
-1869,permute_170,call_function,permute.default,forward,15,1,1,1,1679,4323,4
-1870,alias_default_433,call_function,alias.default,forward,15,1,1,2,1689,4322,4
-1871,alias_default_434,call_function,alias.default,forward,15,1,1,2,1689,4322,4
-1872,alias_default_435,call_function,alias.default,forward,15,1,1,2,1680,4322,4
-1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2
-1874,getitem_135,call_function,getitem,forward,15,1,1,1,1714,4317,2
-1875,getitem_136,call_function,getitem,forward,15,1,1,1,1714,1714,2
-1876,getitem_141,call_function,getitem,forward,15,1,1,1,1714,1714,1
-1877,getitem_142,call_function,getitem,forward,15,1,1,1,1714,1714,1
-1878,alias_default_436,call_function,alias.default,forward,15,1,1,2,1715,4316,4
-1879,permute_171,call_function,permute.default,forward,15,1,1,1,1716,4315,4
-1880,view_359,call_function,view.default,forward,15,1,1,1,1717,4314,3
-1881,dtype_cast_140,call_function,dtype_cast.default,forward,15,1,1,1,1,4316,3
-1882,permute_172,call_function,permute.default,forward,15,1,1,1,2,4315,3
-1883,alias_default_437,call_function,alias.default,forward,15,1,1,2,1718,4313,4
-1884,alias_default_438,call_function,alias.default,forward,15,1,1,2,3,4314,3
-1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5
-1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10
-1887,dtype_cast_141,call_function,dtype_cast.default,forward,15,1,1,1,1,4300,2
-1888,alias_default_439,call_function,alias.default,forward,15,1,1,3,1725,4310,4
-1889,convert_element_type_374,call_function,convert_element_type.default,forward,15,1,1,1,1726,4308,4
-1890,alias_default_441,call_function,alias.default,forward,15,1,1,2,1727,4307,4
-1891,pow_32,call_function,pow.Tensor_Scalar,forward,15,1,1,1,1728,4306,4
-1892,mean_31,call_function,mean.dim,forward,15,1,1,1,1729,4305,4
-1893,add_77,call_function,add.Scalar,forward,15,1,1,1,1730,4304,3
-1894,rsqrt_31,call_function,rsqrt.default,forward,15,1,1,1,1731,4303,3
-1895,alias_default_442,call_function,alias.default,forward,15,1,1,3,1732,4302,3
-1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8
-1897,alias_default_440,call_function,alias.default,forward,15,1,1,2,2,4299,2
-1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8
-1899,convert_element_type_375,call_function,convert_element_type.default,forward,15,1,1,1,1738,4296,6
-1900,dtype_cast_142,call_function,dtype_cast.default,forward,15,1,1,1,1,4296,3
-1901,permute_173,call_function,permute.default,forward,15,1,1,1,2,4295,3
-1902,alias_default_443,call_function,alias.default,forward,15,1,1,4,1739,4295,4
-1903,alias_default_444,call_function,alias.default,forward,15,1,1,2,3,4294,3
-1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5
-1905,alias_default_445,call_function,alias.default,forward,15,1,1,2,1745,4291,4
-1906,convert_element_type_378,call_function,convert_element_type.default,forward,15,1,1,1,1746,4279,4
-1907,alias_default_446,call_function,alias.default,forward,15,1,1,2,1747,4278,4
-1908,neg_15,call_function,neg.default,forward,15,1,1,1,1748,4277,8
-1909,exp_15,call_function,exp.default,forward,15,1,1,1,1749,4276,6
-1910,add_78,call_function,add.Tensor,forward,15,1,1,1,1750,4275,4
-1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6
-1912,convert_element_type_379,call_function,convert_element_type.default,forward,15,1,1,1,1752,4273,6
-1913,dtype_cast_143,call_function,dtype_cast.default,forward,15,1,1,1,1,4277,3
-1914,permute_174,call_function,permute.default,forward,15,1,1,1,2,4276,3
-1915,alias_default_448,call_function,alias.default,forward,15,1,1,2,3,4275,3
-1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5
-1917,alias_default_447,call_function,alias.default,forward,15,1,1,2,1753,4272,4
-1918,alias_default_449,call_function,alias.default,forward,15,1,1,2,1745,4272,4
-1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8
-1920,dtype_cast_144,call_function,dtype_cast.default,forward,15,1,1,1,1,4273,3
-1921,permute_175,call_function,permute.default,forward,15,1,1,1,2,4272,3
-1922,alias_default_450,call_function,alias.default,forward,15,1,1,2,1761,4270,4
-1923,alias_default_451,call_function,alias.default,forward,15,1,1,2,3,4271,3
-1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5
-1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10
-1926,dtype_cast_145,call_function,dtype_cast.default,forward,16,1,1,1,1,4257,2
-1927,alias_default_452,call_function,alias.default,forward,15,1,1,3,1768,4267,4
-1928,convert_element_type_384,call_function,convert_element_type.default,forward,16,1,1,1,1769,4265,4
-1929,alias_default_454,call_function,alias.default,forward,16,1,1,2,1770,4264,4
-1930,pow_33,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1771,4263,4
-1931,mean_32,call_function,mean.dim,forward,16,1,1,1,1772,4262,4
-1932,add_80,call_function,add.Scalar,forward,16,1,1,1,1773,4261,3
-1933,rsqrt_32,call_function,rsqrt.default,forward,16,1,1,1,1774,4260,3
-1934,alias_default_455,call_function,alias.default,forward,16,1,1,3,1775,4259,3
-1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8
-1936,alias_default_453,call_function,alias.default,forward,16,1,1,2,2,4256,2
-1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8
-1938,convert_element_type_385,call_function,convert_element_type.default,forward,16,1,1,1,1781,4253,6
-1939,dtype_cast_146,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3
-1940,permute_176,call_function,permute.default,forward,16,1,1,1,2,4239,3
-1941,alias_default_456,call_function,alias.default,forward,16,1,1,6,1782,4252,4
-1942,alias_default_457,call_function,alias.default,forward,16,1,1,2,3,4238,3
-1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
-1944,dtype_cast_147,call_function,dtype_cast.default,forward,16,1,1,1,1,4240,3
-1945,permute_177,call_function,permute.default,forward,16,1,1,1,2,4239,3
-1946,alias_default_458,call_function,alias.default,forward,16,1,1,2,3,4238,3
-1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
-1948,dtype_cast_148,call_function,dtype_cast.default,forward,16,1,1,1,1,4233,3
-1949,permute_178,call_function,permute.default,forward,16,1,1,1,2,4232,3
-1950,alias_default_459,call_function,alias.default,forward,16,1,1,2,3,4231,3
-1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5
-1952,view_374,call_function,view.default,forward,16,1,1,1,1788,4235,4
-1953,view_375,call_function,view.default,forward,16,1,1,1,1788,4235,4
-1954,view_376,call_function,view.default,forward,16,1,1,1,1788,4228,4
-1955,convert_element_type_392,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4
-1956,view_377,call_function,view.default,forward,16,1,1,1,1790,4233,4
-1957,view_as_complex_32,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6
-1958,convert_element_type_393,call_function,convert_element_type.default,forward,16,1,1,1,1789,4234,4
-1959,view_378,call_function,view.default,forward,16,1,1,1,1790,4233,4
-1960,view_as_complex_33,call_function,view_as_complex.default,forward,16,1,1,1,1791,4232,6
-1961,view_379,call_function,view.default,forward,16,1,1,1,2,4243,3
-1962,alias_default_460,call_function,alias.default,forward,16,1,1,4,3,4242,3
-1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
-1964,view_as_real_32,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6
-1965,view_380,call_function,view.default,forward,16,1,1,1,1796,4229,6
-1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
-1967,view_as_real_33,call_function,view_as_real.default,forward,16,1,1,1,1795,4230,6
-1968,view_381,call_function,view.default,forward,16,1,1,1,1796,4229,6
-1969,convert_element_type_394,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6
-1970,convert_element_type_395,call_function,convert_element_type.default,forward,16,1,1,1,1797,4228,6
-1971,permute_179,call_function,permute.default,forward,16,1,1,1,1798,4227,6
-1972,permute_180,call_function,permute.default,forward,16,1,1,1,1798,4227,6
-1973,permute_181,call_function,permute.default,forward,16,1,1,1,1789,4227,4
-1974,alias_default_461,call_function,alias.default,forward,16,1,1,2,1799,4226,4
-1975,alias_default_462,call_function,alias.default,forward,16,1,1,2,1799,4226,4
-1976,alias_default_463,call_function,alias.default,forward,16,1,1,2,1790,4226,4
-1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2
-1978,getitem_144,call_function,getitem,forward,16,1,1,1,1824,4221,2
-1979,getitem_145,call_function,getitem,forward,16,1,1,1,1824,1824,2
-1980,getitem_150,call_function,getitem,forward,16,1,1,1,1824,1824,1
-1981,getitem_151,call_function,getitem,forward,16,1,1,1,1824,1824,1
-1982,alias_default_464,call_function,alias.default,forward,16,1,1,2,1825,4220,4
-1983,permute_182,call_function,permute.default,forward,16,1,1,1,1826,4219,4
-1984,view_382,call_function,view.default,forward,16,1,1,1,1827,4218,3
-1985,dtype_cast_149,call_function,dtype_cast.default,forward,16,1,1,1,1,4220,3
-1986,permute_183,call_function,permute.default,forward,16,1,1,1,2,4219,3
-1987,alias_default_465,call_function,alias.default,forward,16,1,1,2,1828,4217,4
-1988,alias_default_466,call_function,alias.default,forward,16,1,1,2,3,4218,3
-1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5
-1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10
-1991,dtype_cast_150,call_function,dtype_cast.default,forward,16,1,1,1,1,4204,2
-1992,alias_default_467,call_function,alias.default,forward,16,1,1,3,1835,4214,4
-1993,convert_element_type_398,call_function,convert_element_type.default,forward,16,1,1,1,1836,4212,4
-1994,alias_default_469,call_function,alias.default,forward,16,1,1,2,1837,4211,4
-1995,pow_34,call_function,pow.Tensor_Scalar,forward,16,1,1,1,1838,4210,4
-1996,mean_33,call_function,mean.dim,forward,16,1,1,1,1839,4209,4
-1997,add_82,call_function,add.Scalar,forward,16,1,1,1,1840,4208,3
-1998,rsqrt_33,call_function,rsqrt.default,forward,16,1,1,1,1841,4207,3
-1999,alias_default_470,call_function,alias.default,forward,16,1,1,3,1842,4206,3
-2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8
-2001,alias_default_468,call_function,alias.default,forward,16,1,1,2,2,4203,2
-2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8
-2003,convert_element_type_399,call_function,convert_element_type.default,forward,16,1,1,1,1848,4200,6
-2004,dtype_cast_151,call_function,dtype_cast.default,forward,16,1,1,1,1,4200,3
-2005,permute_184,call_function,permute.default,forward,16,1,1,1,2,4199,3
-2006,alias_default_471,call_function,alias.default,forward,16,1,1,4,1849,4199,4
-2007,alias_default_472,call_function,alias.default,forward,16,1,1,2,3,4198,3
-2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5
-2009,alias_default_473,call_function,alias.default,forward,16,1,1,2,1855,4195,4
-2010,convert_element_type_402,call_function,convert_element_type.default,forward,16,1,1,1,1856,4183,4
-2011,alias_default_474,call_function,alias.default,forward,16,1,1,2,1857,4182,4
-2012,neg_16,call_function,neg.default,forward,16,1,1,1,1858,4181,8
-2013,exp_16,call_function,exp.default,forward,16,1,1,1,1859,4180,6
-2014,add_83,call_function,add.Tensor,forward,16,1,1,1,1860,4179,4
-2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6
-2016,convert_element_type_403,call_function,convert_element_type.default,forward,16,1,1,1,1862,4177,6
-2017,dtype_cast_152,call_function,dtype_cast.default,forward,16,1,1,1,1,4181,3
-2018,permute_185,call_function,permute.default,forward,16,1,1,1,2,4180,3
-2019,alias_default_476,call_function,alias.default,forward,16,1,1,2,3,4179,3
-2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5
-2021,alias_default_475,call_function,alias.default,forward,16,1,1,2,1863,4176,4
-2022,alias_default_477,call_function,alias.default,forward,16,1,1,2,1855,4176,4
-2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8
-2024,dtype_cast_153,call_function,dtype_cast.default,forward,16,1,1,1,1,4177,3
-2025,permute_186,call_function,permute.default,forward,16,1,1,1,2,4176,3
-2026,alias_default_478,call_function,alias.default,forward,16,1,1,2,1871,4174,4
-2027,alias_default_479,call_function,alias.default,forward,16,1,1,2,3,4175,3
-2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5
-2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10
-2030,dtype_cast_154,call_function,dtype_cast.default,forward,17,1,1,1,1,4161,2
-2031,alias_default_480,call_function,alias.default,forward,16,1,1,3,1878,4171,4
-2032,convert_element_type_408,call_function,convert_element_type.default,forward,17,1,1,1,1879,4169,4
-2033,alias_default_482,call_function,alias.default,forward,17,1,1,2,1880,4168,4
-2034,pow_35,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1881,4167,4
-2035,mean_34,call_function,mean.dim,forward,17,1,1,1,1882,4166,4
-2036,add_85,call_function,add.Scalar,forward,17,1,1,1,1883,4165,3
-2037,rsqrt_34,call_function,rsqrt.default,forward,17,1,1,1,1884,4164,3
-2038,alias_default_483,call_function,alias.default,forward,17,1,1,3,1885,4163,3
-2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8
-2040,alias_default_481,call_function,alias.default,forward,17,1,1,2,2,4160,2
-2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8
-2042,convert_element_type_409,call_function,convert_element_type.default,forward,17,1,1,1,1891,4157,6
-2043,dtype_cast_155,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3
-2044,permute_187,call_function,permute.default,forward,17,1,1,1,2,4143,3
-2045,alias_default_484,call_function,alias.default,forward,17,1,1,6,1892,4156,4
-2046,alias_default_485,call_function,alias.default,forward,17,1,1,2,3,4142,3
-2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
-2048,dtype_cast_156,call_function,dtype_cast.default,forward,17,1,1,1,1,4144,3
-2049,permute_188,call_function,permute.default,forward,17,1,1,1,2,4143,3
-2050,alias_default_486,call_function,alias.default,forward,17,1,1,2,3,4142,3
-2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
-2052,dtype_cast_157,call_function,dtype_cast.default,forward,17,1,1,1,1,4137,3
-2053,permute_189,call_function,permute.default,forward,17,1,1,1,2,4136,3
-2054,alias_default_487,call_function,alias.default,forward,17,1,1,2,3,4135,3
-2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5
-2056,view_397,call_function,view.default,forward,17,1,1,1,1898,4139,4
-2057,view_398,call_function,view.default,forward,17,1,1,1,1898,4139,4
-2058,view_399,call_function,view.default,forward,17,1,1,1,1898,4132,4
-2059,convert_element_type_416,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4
-2060,view_400,call_function,view.default,forward,17,1,1,1,1900,4137,4
-2061,view_as_complex_34,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6
-2062,convert_element_type_417,call_function,convert_element_type.default,forward,17,1,1,1,1899,4138,4
-2063,view_401,call_function,view.default,forward,17,1,1,1,1900,4137,4
-2064,view_as_complex_35,call_function,view_as_complex.default,forward,17,1,1,1,1901,4136,6
-2065,view_402,call_function,view.default,forward,17,1,1,1,2,4147,3
-2066,alias_default_488,call_function,alias.default,forward,17,1,1,4,3,4146,3
-2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
-2068,view_as_real_34,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6
-2069,view_403,call_function,view.default,forward,17,1,1,1,1906,4133,6
-2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
-2071,view_as_real_35,call_function,view_as_real.default,forward,17,1,1,1,1905,4134,6
-2072,view_404,call_function,view.default,forward,17,1,1,1,1906,4133,6
-2073,convert_element_type_418,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6
-2074,convert_element_type_419,call_function,convert_element_type.default,forward,17,1,1,1,1907,4132,6
-2075,permute_190,call_function,permute.default,forward,17,1,1,1,1908,4131,6
-2076,permute_191,call_function,permute.default,forward,17,1,1,1,1908,4131,6
-2077,permute_192,call_function,permute.default,forward,17,1,1,1,1899,4131,4
-2078,alias_default_489,call_function,alias.default,forward,17,1,1,2,1909,4130,4
-2079,alias_default_490,call_function,alias.default,forward,17,1,1,2,1909,4130,4
-2080,alias_default_491,call_function,alias.default,forward,17,1,1,2,1900,4130,4
-2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2
-2082,getitem_153,call_function,getitem,forward,17,1,1,1,1934,4125,2
-2083,getitem_154,call_function,getitem,forward,17,1,1,1,1934,1934,2
-2084,getitem_159,call_function,getitem,forward,17,1,1,1,1934,1934,1
-2085,getitem_160,call_function,getitem,forward,17,1,1,1,1934,1934,1
-2086,alias_default_492,call_function,alias.default,forward,17,1,1,2,1935,4124,4
-2087,permute_193,call_function,permute.default,forward,17,1,1,1,1936,4123,4
-2088,view_405,call_function,view.default,forward,17,1,1,1,1937,4122,3
-2089,dtype_cast_158,call_function,dtype_cast.default,forward,17,1,1,1,1,4124,3
-2090,permute_194,call_function,permute.default,forward,17,1,1,1,2,4123,3
-2091,alias_default_493,call_function,alias.default,forward,17,1,1,2,1938,4121,4
-2092,alias_default_494,call_function,alias.default,forward,17,1,1,2,3,4122,3
-2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5
-2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10
-2095,dtype_cast_159,call_function,dtype_cast.default,forward,17,1,1,1,1,4108,2
-2096,alias_default_495,call_function,alias.default,forward,17,1,1,3,1945,4118,4
-2097,convert_element_type_422,call_function,convert_element_type.default,forward,17,1,1,1,1946,4116,4
-2098,alias_default_497,call_function,alias.default,forward,17,1,1,2,1947,4115,4
-2099,pow_36,call_function,pow.Tensor_Scalar,forward,17,1,1,1,1948,4114,4
-2100,mean_35,call_function,mean.dim,forward,17,1,1,1,1949,4113,4
-2101,add_87,call_function,add.Scalar,forward,17,1,1,1,1950,4112,3
-2102,rsqrt_35,call_function,rsqrt.default,forward,17,1,1,1,1951,4111,3
-2103,alias_default_498,call_function,alias.default,forward,17,1,1,3,1952,4110,3
-2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8
-2105,alias_default_496,call_function,alias.default,forward,17,1,1,2,2,4107,2
-2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8
-2107,convert_element_type_423,call_function,convert_element_type.default,forward,17,1,1,1,1958,4104,6
-2108,dtype_cast_160,call_function,dtype_cast.default,forward,17,1,1,1,1,4104,3
-2109,permute_195,call_function,permute.default,forward,17,1,1,1,2,4103,3
-2110,alias_default_499,call_function,alias.default,forward,17,1,1,4,1959,4103,4
-2111,alias_default_500,call_function,alias.default,forward,17,1,1,2,3,4102,3
-2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5
-2113,alias_default_501,call_function,alias.default,forward,17,1,1,2,1965,4099,4
-2114,convert_element_type_426,call_function,convert_element_type.default,forward,17,1,1,1,1966,4087,4
-2115,alias_default_502,call_function,alias.default,forward,17,1,1,2,1967,4086,4
-2116,neg_17,call_function,neg.default,forward,17,1,1,1,1968,4085,8
-2117,exp_17,call_function,exp.default,forward,17,1,1,1,1969,4084,6
-2118,add_88,call_function,add.Tensor,forward,17,1,1,1,1970,4083,4
-2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6
-2120,convert_element_type_427,call_function,convert_element_type.default,forward,17,1,1,1,1972,4081,6
-2121,dtype_cast_161,call_function,dtype_cast.default,forward,17,1,1,1,1,4085,3
-2122,permute_196,call_function,permute.default,forward,17,1,1,1,2,4084,3
-2123,alias_default_504,call_function,alias.default,forward,17,1,1,2,3,4083,3
-2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5
-2125,alias_default_503,call_function,alias.default,forward,17,1,1,2,1973,4080,4
-2126,alias_default_505,call_function,alias.default,forward,17,1,1,2,1965,4080,4
-2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8
-2128,dtype_cast_162,call_function,dtype_cast.default,forward,17,1,1,1,1,4081,3
-2129,permute_197,call_function,permute.default,forward,17,1,1,1,2,4080,3
-2130,alias_default_506,call_function,alias.default,forward,17,1,1,2,1981,4078,4
-2131,alias_default_507,call_function,alias.default,forward,17,1,1,2,3,4079,3
-2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5
-2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10
-2134,dtype_cast_163,call_function,dtype_cast.default,forward,18,1,1,1,1,4065,2
-2135,alias_default_508,call_function,alias.default,forward,17,1,1,3,1988,4075,4
-2136,convert_element_type_432,call_function,convert_element_type.default,forward,18,1,1,1,1989,4073,4
-2137,alias_default_510,call_function,alias.default,forward,18,1,1,2,1990,4072,4
-2138,pow_37,call_function,pow.Tensor_Scalar,forward,18,1,1,1,1991,4071,4
-2139,mean_36,call_function,mean.dim,forward,18,1,1,1,1992,4070,4
-2140,add_90,call_function,add.Scalar,forward,18,1,1,1,1993,4069,3
-2141,rsqrt_36,call_function,rsqrt.default,forward,18,1,1,1,1994,4068,3
-2142,alias_default_511,call_function,alias.default,forward,18,1,1,3,1995,4067,3
-2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8
-2144,alias_default_509,call_function,alias.default,forward,18,1,1,2,2,4064,2
-2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8
-2146,convert_element_type_433,call_function,convert_element_type.default,forward,18,1,1,1,2001,4061,6
-2147,dtype_cast_164,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3
-2148,permute_198,call_function,permute.default,forward,18,1,1,1,2,4047,3
-2149,alias_default_512,call_function,alias.default,forward,18,1,1,6,2002,4060,4
-2150,alias_default_513,call_function,alias.default,forward,18,1,1,2,3,4046,3
-2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
-2152,dtype_cast_165,call_function,dtype_cast.default,forward,18,1,1,1,1,4048,3
-2153,permute_199,call_function,permute.default,forward,18,1,1,1,2,4047,3
-2154,alias_default_514,call_function,alias.default,forward,18,1,1,2,3,4046,3
-2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
-2156,dtype_cast_166,call_function,dtype_cast.default,forward,18,1,1,1,1,4041,3
-2157,permute_200,call_function,permute.default,forward,18,1,1,1,2,4040,3
-2158,alias_default_515,call_function,alias.default,forward,18,1,1,2,3,4039,3
-2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5
-2160,view_420,call_function,view.default,forward,18,1,1,1,2008,4043,4
-2161,view_421,call_function,view.default,forward,18,1,1,1,2008,4043,4
-2162,view_422,call_function,view.default,forward,18,1,1,1,2008,4036,4
-2163,convert_element_type_440,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4
-2164,view_423,call_function,view.default,forward,18,1,1,1,2010,4041,4
-2165,view_as_complex_36,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6
-2166,convert_element_type_441,call_function,convert_element_type.default,forward,18,1,1,1,2009,4042,4
-2167,view_424,call_function,view.default,forward,18,1,1,1,2010,4041,4
-2168,view_as_complex_37,call_function,view_as_complex.default,forward,18,1,1,1,2011,4040,6
-2169,view_425,call_function,view.default,forward,18,1,1,1,2,4051,3
-2170,alias_default_516,call_function,alias.default,forward,18,1,1,4,3,4050,3
-2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
-2172,view_as_real_36,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6
-2173,view_426,call_function,view.default,forward,18,1,1,1,2016,4037,6
-2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
-2175,view_as_real_37,call_function,view_as_real.default,forward,18,1,1,1,2015,4038,6
-2176,view_427,call_function,view.default,forward,18,1,1,1,2016,4037,6
-2177,convert_element_type_442,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6
-2178,convert_element_type_443,call_function,convert_element_type.default,forward,18,1,1,1,2017,4036,6
-2179,permute_201,call_function,permute.default,forward,18,1,1,1,2018,4035,6
-2180,permute_202,call_function,permute.default,forward,18,1,1,1,2018,4035,6
-2181,permute_203,call_function,permute.default,forward,18,1,1,1,2009,4035,4
-2182,alias_default_517,call_function,alias.default,forward,18,1,1,2,2019,4034,4
-2183,alias_default_518,call_function,alias.default,forward,18,1,1,2,2019,4034,4
-2184,alias_default_519,call_function,alias.default,forward,18,1,1,2,2010,4034,4
-2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2
-2186,getitem_162,call_function,getitem,forward,18,1,1,1,2044,4029,2
-2187,getitem_163,call_function,getitem,forward,18,1,1,1,2044,2044,2
-2188,getitem_168,call_function,getitem,forward,18,1,1,1,2044,2044,1
-2189,getitem_169,call_function,getitem,forward,18,1,1,1,2044,2044,1
-2190,alias_default_520,call_function,alias.default,forward,18,1,1,2,2045,4028,4
-2191,permute_204,call_function,permute.default,forward,18,1,1,1,2046,4027,4
-2192,view_428,call_function,view.default,forward,18,1,1,1,2047,4026,3
-2193,dtype_cast_167,call_function,dtype_cast.default,forward,18,1,1,1,1,4028,3
-2194,permute_205,call_function,permute.default,forward,18,1,1,1,2,4027,3
-2195,alias_default_521,call_function,alias.default,forward,18,1,1,2,2048,4025,4
-2196,alias_default_522,call_function,alias.default,forward,18,1,1,2,3,4026,3
-2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5
-2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10
-2199,dtype_cast_168,call_function,dtype_cast.default,forward,18,1,1,1,1,4012,2
-2200,alias_default_523,call_function,alias.default,forward,18,1,1,3,2055,4022,4
-2201,convert_element_type_446,call_function,convert_element_type.default,forward,18,1,1,1,2056,4020,4
-2202,alias_default_525,call_function,alias.default,forward,18,1,1,2,2057,4019,4
-2203,pow_38,call_function,pow.Tensor_Scalar,forward,18,1,1,1,2058,4018,4
-2204,mean_37,call_function,mean.dim,forward,18,1,1,1,2059,4017,4
-2205,add_92,call_function,add.Scalar,forward,18,1,1,1,2060,4016,3
-2206,rsqrt_37,call_function,rsqrt.default,forward,18,1,1,1,2061,4015,3
-2207,alias_default_526,call_function,alias.default,forward,18,1,1,3,2062,4014,3
-2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8
-2209,alias_default_524,call_function,alias.default,forward,18,1,1,2,2,4011,2
-2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8
-2211,convert_element_type_447,call_function,convert_element_type.default,forward,18,1,1,1,2068,4008,6
-2212,dtype_cast_169,call_function,dtype_cast.default,forward,18,1,1,1,1,4008,3
-2213,permute_206,call_function,permute.default,forward,18,1,1,1,2,4007,3
-2214,alias_default_527,call_function,alias.default,forward,18,1,1,4,2069,4007,4
-2215,alias_default_528,call_function,alias.default,forward,18,1,1,2,3,4006,3
-2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5
-2217,alias_default_529,call_function,alias.default,forward,18,1,1,2,2075,4003,4
-2218,convert_element_type_450,call_function,convert_element_type.default,forward,18,1,1,1,2076,3991,4
-2219,alias_default_530,call_function,alias.default,forward,18,1,1,2,2077,3990,4
-2220,neg_18,call_function,neg.default,forward,18,1,1,1,2078,3989,8
-2221,exp_18,call_function,exp.default,forward,18,1,1,1,2079,3988,6
-2222,add_93,call_function,add.Tensor,forward,18,1,1,1,2080,3987,4
-2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6
-2224,convert_element_type_451,call_function,convert_element_type.default,forward,18,1,1,1,2082,3985,6
-2225,dtype_cast_170,call_function,dtype_cast.default,forward,18,1,1,1,1,3989,3
-2226,permute_207,call_function,permute.default,forward,18,1,1,1,2,3988,3
-2227,alias_default_532,call_function,alias.default,forward,18,1,1,2,3,3987,3
-2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5
-2229,alias_default_531,call_function,alias.default,forward,18,1,1,2,2083,3984,4
-2230,alias_default_533,call_function,alias.default,forward,18,1,1,2,2075,3984,4
-2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8
-2232,dtype_cast_171,call_function,dtype_cast.default,forward,18,1,1,1,1,3985,3
-2233,permute_208,call_function,permute.default,forward,18,1,1,1,2,3984,3
-2234,alias_default_534,call_function,alias.default,forward,18,1,1,2,2091,3982,4
-2235,alias_default_535,call_function,alias.default,forward,18,1,1,2,3,3983,3
-2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5
-2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10
-2238,dtype_cast_172,call_function,dtype_cast.default,forward,19,1,1,1,1,3969,2
-2239,alias_default_536,call_function,alias.default,forward,18,1,1,3,2098,3979,4
-2240,convert_element_type_456,call_function,convert_element_type.default,forward,19,1,1,1,2099,3977,4
-2241,alias_default_538,call_function,alias.default,forward,19,1,1,2,2100,3976,4
-2242,pow_39,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2101,3975,4
-2243,mean_38,call_function,mean.dim,forward,19,1,1,1,2102,3974,4
-2244,add_95,call_function,add.Scalar,forward,19,1,1,1,2103,3973,3
-2245,rsqrt_38,call_function,rsqrt.default,forward,19,1,1,1,2104,3972,3
-2246,alias_default_539,call_function,alias.default,forward,19,1,1,3,2105,3971,3
-2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8
-2248,alias_default_537,call_function,alias.default,forward,19,1,1,2,2,3968,2
-2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8
-2250,convert_element_type_457,call_function,convert_element_type.default,forward,19,1,1,1,2111,3965,6
-2251,dtype_cast_173,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3
-2252,permute_209,call_function,permute.default,forward,19,1,1,1,2,3951,3
-2253,alias_default_540,call_function,alias.default,forward,19,1,1,6,2112,3964,4
-2254,alias_default_541,call_function,alias.default,forward,19,1,1,2,3,3950,3
-2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
-2256,dtype_cast_174,call_function,dtype_cast.default,forward,19,1,1,1,1,3952,3
-2257,permute_210,call_function,permute.default,forward,19,1,1,1,2,3951,3
-2258,alias_default_542,call_function,alias.default,forward,19,1,1,2,3,3950,3
-2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
-2260,dtype_cast_175,call_function,dtype_cast.default,forward,19,1,1,1,1,3945,3
-2261,permute_211,call_function,permute.default,forward,19,1,1,1,2,3944,3
-2262,alias_default_543,call_function,alias.default,forward,19,1,1,2,3,3943,3
-2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5
-2264,view_443,call_function,view.default,forward,19,1,1,1,2118,3947,4
-2265,view_444,call_function,view.default,forward,19,1,1,1,2118,3947,4
-2266,view_445,call_function,view.default,forward,19,1,1,1,2118,3940,4
-2267,convert_element_type_464,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4
-2268,view_446,call_function,view.default,forward,19,1,1,1,2120,3945,4
-2269,view_as_complex_38,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6
-2270,convert_element_type_465,call_function,convert_element_type.default,forward,19,1,1,1,2119,3946,4
-2271,view_447,call_function,view.default,forward,19,1,1,1,2120,3945,4
-2272,view_as_complex_39,call_function,view_as_complex.default,forward,19,1,1,1,2121,3944,6
-2273,view_448,call_function,view.default,forward,19,1,1,1,2,3955,3
-2274,alias_default_544,call_function,alias.default,forward,19,1,1,4,3,3954,3
-2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
-2276,view_as_real_38,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6
-2277,view_449,call_function,view.default,forward,19,1,1,1,2126,3941,6
-2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
-2279,view_as_real_39,call_function,view_as_real.default,forward,19,1,1,1,2125,3942,6
-2280,view_450,call_function,view.default,forward,19,1,1,1,2126,3941,6
-2281,convert_element_type_466,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6
-2282,convert_element_type_467,call_function,convert_element_type.default,forward,19,1,1,1,2127,3940,6
-2283,permute_212,call_function,permute.default,forward,19,1,1,1,2128,3939,6
-2284,permute_213,call_function,permute.default,forward,19,1,1,1,2128,3939,6
-2285,permute_214,call_function,permute.default,forward,19,1,1,1,2119,3939,4
-2286,alias_default_545,call_function,alias.default,forward,19,1,1,2,2129,3938,4
-2287,alias_default_546,call_function,alias.default,forward,19,1,1,2,2129,3938,4
-2288,alias_default_547,call_function,alias.default,forward,19,1,1,2,2120,3938,4
-2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2
-2290,getitem_171,call_function,getitem,forward,19,1,1,1,2154,3933,2
-2291,getitem_172,call_function,getitem,forward,19,1,1,1,2154,2154,2
-2292,getitem_177,call_function,getitem,forward,19,1,1,1,2154,2154,1
-2293,getitem_178,call_function,getitem,forward,19,1,1,1,2154,2154,1
-2294,alias_default_548,call_function,alias.default,forward,19,1,1,2,2155,3932,4
-2295,permute_215,call_function,permute.default,forward,19,1,1,1,2156,3931,4
-2296,view_451,call_function,view.default,forward,19,1,1,1,2157,3930,3
-2297,dtype_cast_176,call_function,dtype_cast.default,forward,19,1,1,1,1,3932,3
-2298,permute_216,call_function,permute.default,forward,19,1,1,1,2,3931,3
-2299,alias_default_549,call_function,alias.default,forward,19,1,1,2,2158,3929,4
-2300,alias_default_550,call_function,alias.default,forward,19,1,1,2,3,3930,3
-2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5
-2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10
-2303,dtype_cast_177,call_function,dtype_cast.default,forward,19,1,1,1,1,3916,2
-2304,alias_default_551,call_function,alias.default,forward,19,1,1,3,2165,3926,4
-2305,convert_element_type_470,call_function,convert_element_type.default,forward,19,1,1,1,2166,3924,4
-2306,alias_default_553,call_function,alias.default,forward,19,1,1,2,2167,3923,4
-2307,pow_40,call_function,pow.Tensor_Scalar,forward,19,1,1,1,2168,3922,4
-2308,mean_39,call_function,mean.dim,forward,19,1,1,1,2169,3921,4
-2309,add_97,call_function,add.Scalar,forward,19,1,1,1,2170,3920,3
-2310,rsqrt_39,call_function,rsqrt.default,forward,19,1,1,1,2171,3919,3
-2311,alias_default_554,call_function,alias.default,forward,19,1,1,3,2172,3918,3
-2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8
-2313,alias_default_552,call_function,alias.default,forward,19,1,1,2,2,3915,2
-2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8
-2315,convert_element_type_471,call_function,convert_element_type.default,forward,19,1,1,1,2178,3912,6
-2316,dtype_cast_178,call_function,dtype_cast.default,forward,19,1,1,1,1,3912,3
-2317,permute_217,call_function,permute.default,forward,19,1,1,1,2,3911,3
-2318,alias_default_555,call_function,alias.default,forward,19,1,1,4,2179,3911,4
-2319,alias_default_556,call_function,alias.default,forward,19,1,1,2,3,3910,3
-2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5
-2321,alias_default_557,call_function,alias.default,forward,19,1,1,2,2185,3907,4
-2322,convert_element_type_474,call_function,convert_element_type.default,forward,19,1,1,1,2186,3895,4
-2323,alias_default_558,call_function,alias.default,forward,19,1,1,2,2187,3894,4
-2324,neg_19,call_function,neg.default,forward,19,1,1,1,2188,3893,8
-2325,exp_19,call_function,exp.default,forward,19,1,1,1,2189,3892,6
-2326,add_98,call_function,add.Tensor,forward,19,1,1,1,2190,3891,4
-2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6
-2328,convert_element_type_475,call_function,convert_element_type.default,forward,19,1,1,1,2192,3889,6
-2329,dtype_cast_179,call_function,dtype_cast.default,forward,19,1,1,1,1,3893,3
-2330,permute_218,call_function,permute.default,forward,19,1,1,1,2,3892,3
-2331,alias_default_560,call_function,alias.default,forward,19,1,1,2,3,3891,3
-2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5
-2333,alias_default_559,call_function,alias.default,forward,19,1,1,2,2193,3888,4
-2334,alias_default_561,call_function,alias.default,forward,19,1,1,2,2185,3888,4
-2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8
-2336,dtype_cast_180,call_function,dtype_cast.default,forward,19,1,1,1,1,3889,3
-2337,permute_219,call_function,permute.default,forward,19,1,1,1,2,3888,3
-2338,alias_default_562,call_function,alias.default,forward,19,1,1,2,2201,3886,4
-2339,alias_default_563,call_function,alias.default,forward,19,1,1,2,3,3887,3
-2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5
-2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10
-2342,dtype_cast_181,call_function,dtype_cast.default,forward,20,1,1,1,1,3873,2
-2343,alias_default_564,call_function,alias.default,forward,19,1,1,3,2208,3883,4
-2344,convert_element_type_480,call_function,convert_element_type.default,forward,20,1,1,1,2209,3881,4
-2345,alias_default_566,call_function,alias.default,forward,20,1,1,2,2210,3880,4
-2346,pow_41,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2211,3879,4
-2347,mean_40,call_function,mean.dim,forward,20,1,1,1,2212,3878,4
-2348,add_100,call_function,add.Scalar,forward,20,1,1,1,2213,3877,3
-2349,rsqrt_40,call_function,rsqrt.default,forward,20,1,1,1,2214,3876,3
-2350,alias_default_567,call_function,alias.default,forward,20,1,1,3,2215,3875,3
-2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8
-2352,alias_default_565,call_function,alias.default,forward,20,1,1,2,2,3872,2
-2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8
-2354,convert_element_type_481,call_function,convert_element_type.default,forward,20,1,1,1,2221,3869,6
-2355,dtype_cast_182,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3
-2356,permute_220,call_function,permute.default,forward,20,1,1,1,2,3855,3
-2357,alias_default_568,call_function,alias.default,forward,20,1,1,6,2222,3868,4
-2358,alias_default_569,call_function,alias.default,forward,20,1,1,2,3,3854,3
-2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
-2360,dtype_cast_183,call_function,dtype_cast.default,forward,20,1,1,1,1,3856,3
-2361,permute_221,call_function,permute.default,forward,20,1,1,1,2,3855,3
-2362,alias_default_570,call_function,alias.default,forward,20,1,1,2,3,3854,3
-2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
-2364,dtype_cast_184,call_function,dtype_cast.default,forward,20,1,1,1,1,3849,3
-2365,permute_222,call_function,permute.default,forward,20,1,1,1,2,3848,3
-2366,alias_default_571,call_function,alias.default,forward,20,1,1,2,3,3847,3
-2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5
-2368,view_466,call_function,view.default,forward,20,1,1,1,2228,3851,4
-2369,view_467,call_function,view.default,forward,20,1,1,1,2228,3851,4
-2370,view_468,call_function,view.default,forward,20,1,1,1,2228,3844,4
-2371,convert_element_type_488,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4
-2372,view_469,call_function,view.default,forward,20,1,1,1,2230,3849,4
-2373,view_as_complex_40,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6
-2374,convert_element_type_489,call_function,convert_element_type.default,forward,20,1,1,1,2229,3850,4
-2375,view_470,call_function,view.default,forward,20,1,1,1,2230,3849,4
-2376,view_as_complex_41,call_function,view_as_complex.default,forward,20,1,1,1,2231,3848,6
-2377,view_471,call_function,view.default,forward,20,1,1,1,2,3859,3
-2378,alias_default_572,call_function,alias.default,forward,20,1,1,4,3,3858,3
-2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
-2380,view_as_real_40,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6
-2381,view_472,call_function,view.default,forward,20,1,1,1,2236,3845,6
-2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
-2383,view_as_real_41,call_function,view_as_real.default,forward,20,1,1,1,2235,3846,6
-2384,view_473,call_function,view.default,forward,20,1,1,1,2236,3845,6
-2385,convert_element_type_490,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6
-2386,convert_element_type_491,call_function,convert_element_type.default,forward,20,1,1,1,2237,3844,6
-2387,permute_223,call_function,permute.default,forward,20,1,1,1,2238,3843,6
-2388,permute_224,call_function,permute.default,forward,20,1,1,1,2238,3843,6
-2389,permute_225,call_function,permute.default,forward,20,1,1,1,2229,3843,4
-2390,alias_default_573,call_function,alias.default,forward,20,1,1,2,2239,3842,4
-2391,alias_default_574,call_function,alias.default,forward,20,1,1,2,2239,3842,4
-2392,alias_default_575,call_function,alias.default,forward,20,1,1,2,2230,3842,4
-2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2
-2394,getitem_180,call_function,getitem,forward,20,1,1,1,2264,3837,2
-2395,getitem_181,call_function,getitem,forward,20,1,1,1,2264,2264,2
-2396,getitem_186,call_function,getitem,forward,20,1,1,1,2264,2264,1
-2397,getitem_187,call_function,getitem,forward,20,1,1,1,2264,2264,1
-2398,alias_default_576,call_function,alias.default,forward,20,1,1,2,2265,3836,4
-2399,permute_226,call_function,permute.default,forward,20,1,1,1,2266,3835,4
-2400,view_474,call_function,view.default,forward,20,1,1,1,2267,3834,3
-2401,dtype_cast_185,call_function,dtype_cast.default,forward,20,1,1,1,1,3836,3
-2402,permute_227,call_function,permute.default,forward,20,1,1,1,2,3835,3
-2403,alias_default_577,call_function,alias.default,forward,20,1,1,2,2268,3833,4
-2404,alias_default_578,call_function,alias.default,forward,20,1,1,2,3,3834,3
-2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5
-2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10
-2407,dtype_cast_186,call_function,dtype_cast.default,forward,20,1,1,1,1,3820,2
-2408,alias_default_579,call_function,alias.default,forward,20,1,1,3,2275,3830,4
-2409,convert_element_type_494,call_function,convert_element_type.default,forward,20,1,1,1,2276,3828,4
-2410,alias_default_581,call_function,alias.default,forward,20,1,1,2,2277,3827,4
-2411,pow_42,call_function,pow.Tensor_Scalar,forward,20,1,1,1,2278,3826,4
-2412,mean_41,call_function,mean.dim,forward,20,1,1,1,2279,3825,4
-2413,add_102,call_function,add.Scalar,forward,20,1,1,1,2280,3824,3
-2414,rsqrt_41,call_function,rsqrt.default,forward,20,1,1,1,2281,3823,3
-2415,alias_default_582,call_function,alias.default,forward,20,1,1,3,2282,3822,3
-2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8
-2417,alias_default_580,call_function,alias.default,forward,20,1,1,2,2,3819,2
-2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8
-2419,convert_element_type_495,call_function,convert_element_type.default,forward,20,1,1,1,2288,3816,6
-2420,dtype_cast_187,call_function,dtype_cast.default,forward,20,1,1,1,1,3816,3
-2421,permute_228,call_function,permute.default,forward,20,1,1,1,2,3815,3
-2422,alias_default_583,call_function,alias.default,forward,20,1,1,4,2289,3815,4
-2423,alias_default_584,call_function,alias.default,forward,20,1,1,2,3,3814,3
-2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5
-2425,alias_default_585,call_function,alias.default,forward,20,1,1,2,2295,3811,4
-2426,convert_element_type_498,call_function,convert_element_type.default,forward,20,1,1,1,2296,3799,4
-2427,alias_default_586,call_function,alias.default,forward,20,1,1,2,2297,3798,4
-2428,neg_20,call_function,neg.default,forward,20,1,1,1,2298,3797,8
-2429,exp_20,call_function,exp.default,forward,20,1,1,1,2299,3796,6
-2430,add_103,call_function,add.Tensor,forward,20,1,1,1,2300,3795,4
-2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6
-2432,convert_element_type_499,call_function,convert_element_type.default,forward,20,1,1,1,2302,3793,6
-2433,dtype_cast_188,call_function,dtype_cast.default,forward,20,1,1,1,1,3797,3
-2434,permute_229,call_function,permute.default,forward,20,1,1,1,2,3796,3
-2435,alias_default_588,call_function,alias.default,forward,20,1,1,2,3,3795,3
-2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5
-2437,alias_default_587,call_function,alias.default,forward,20,1,1,2,2303,3792,4
-2438,alias_default_589,call_function,alias.default,forward,20,1,1,2,2295,3792,4
-2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8
-2440,dtype_cast_189,call_function,dtype_cast.default,forward,20,1,1,1,1,3793,3
-2441,permute_230,call_function,permute.default,forward,20,1,1,1,2,3792,3
-2442,alias_default_590,call_function,alias.default,forward,20,1,1,2,2311,3790,4
-2443,alias_default_591,call_function,alias.default,forward,20,1,1,2,3,3791,3
-2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5
-2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10
-2446,dtype_cast_190,call_function,dtype_cast.default,forward,21,1,1,1,1,3777,2
-2447,alias_default_592,call_function,alias.default,forward,20,1,1,3,2318,3787,4
-2448,convert_element_type_504,call_function,convert_element_type.default,forward,21,1,1,1,2319,3785,4
-2449,alias_default_594,call_function,alias.default,forward,21,1,1,2,2320,3784,4
-2450,pow_43,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2321,3783,4
-2451,mean_42,call_function,mean.dim,forward,21,1,1,1,2322,3782,4
-2452,add_105,call_function,add.Scalar,forward,21,1,1,1,2323,3781,3
-2453,rsqrt_42,call_function,rsqrt.default,forward,21,1,1,1,2324,3780,3
-2454,alias_default_595,call_function,alias.default,forward,21,1,1,3,2325,3779,3
-2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8
-2456,alias_default_593,call_function,alias.default,forward,21,1,1,2,2,3776,2
-2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8
-2458,convert_element_type_505,call_function,convert_element_type.default,forward,21,1,1,1,2331,3773,6
-2459,dtype_cast_191,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3
-2460,permute_231,call_function,permute.default,forward,21,1,1,1,2,3759,3
-2461,alias_default_596,call_function,alias.default,forward,21,1,1,6,2332,3772,4
-2462,alias_default_597,call_function,alias.default,forward,21,1,1,2,3,3758,3
-2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
-2464,dtype_cast_192,call_function,dtype_cast.default,forward,21,1,1,1,1,3760,3
-2465,permute_232,call_function,permute.default,forward,21,1,1,1,2,3759,3
-2466,alias_default_598,call_function,alias.default,forward,21,1,1,2,3,3758,3
-2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
-2468,dtype_cast_193,call_function,dtype_cast.default,forward,21,1,1,1,1,3753,3
-2469,permute_233,call_function,permute.default,forward,21,1,1,1,2,3752,3
-2470,alias_default_599,call_function,alias.default,forward,21,1,1,2,3,3751,3
-2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5
-2472,view_489,call_function,view.default,forward,21,1,1,1,2338,3755,4
-2473,view_490,call_function,view.default,forward,21,1,1,1,2338,3755,4
-2474,view_491,call_function,view.default,forward,21,1,1,1,2338,3748,4
-2475,convert_element_type_512,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4
-2476,view_492,call_function,view.default,forward,21,1,1,1,2340,3753,4
-2477,view_as_complex_42,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6
-2478,convert_element_type_513,call_function,convert_element_type.default,forward,21,1,1,1,2339,3754,4
-2479,view_493,call_function,view.default,forward,21,1,1,1,2340,3753,4
-2480,view_as_complex_43,call_function,view_as_complex.default,forward,21,1,1,1,2341,3752,6
-2481,view_494,call_function,view.default,forward,21,1,1,1,2,3763,3
-2482,alias_default_600,call_function,alias.default,forward,21,1,1,4,3,3762,3
-2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
-2484,view_as_real_42,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6
-2485,view_495,call_function,view.default,forward,21,1,1,1,2346,3749,6
-2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
-2487,view_as_real_43,call_function,view_as_real.default,forward,21,1,1,1,2345,3750,6
-2488,view_496,call_function,view.default,forward,21,1,1,1,2346,3749,6
-2489,convert_element_type_514,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6
-2490,convert_element_type_515,call_function,convert_element_type.default,forward,21,1,1,1,2347,3748,6
-2491,permute_234,call_function,permute.default,forward,21,1,1,1,2348,3747,6
-2492,permute_235,call_function,permute.default,forward,21,1,1,1,2348,3747,6
-2493,permute_236,call_function,permute.default,forward,21,1,1,1,2339,3747,4
-2494,alias_default_601,call_function,alias.default,forward,21,1,1,2,2349,3746,4
-2495,alias_default_602,call_function,alias.default,forward,21,1,1,2,2349,3746,4
-2496,alias_default_603,call_function,alias.default,forward,21,1,1,2,2340,3746,4
-2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2
-2498,getitem_189,call_function,getitem,forward,21,1,1,1,2374,3741,2
-2499,getitem_190,call_function,getitem,forward,21,1,1,1,2374,2374,2
-2500,getitem_195,call_function,getitem,forward,21,1,1,1,2374,2374,1
-2501,getitem_196,call_function,getitem,forward,21,1,1,1,2374,2374,1
-2502,alias_default_604,call_function,alias.default,forward,21,1,1,2,2375,3740,4
-2503,permute_237,call_function,permute.default,forward,21,1,1,1,2376,3739,4
-2504,view_497,call_function,view.default,forward,21,1,1,1,2377,3738,3
-2505,dtype_cast_194,call_function,dtype_cast.default,forward,21,1,1,1,1,3740,3
-2506,permute_238,call_function,permute.default,forward,21,1,1,1,2,3739,3
-2507,alias_default_605,call_function,alias.default,forward,21,1,1,2,2378,3737,4
-2508,alias_default_606,call_function,alias.default,forward,21,1,1,2,3,3738,3
-2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5
-2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10
-2511,dtype_cast_195,call_function,dtype_cast.default,forward,21,1,1,1,1,3724,2
-2512,alias_default_607,call_function,alias.default,forward,21,1,1,3,2385,3734,4
-2513,convert_element_type_518,call_function,convert_element_type.default,forward,21,1,1,1,2386,3732,4
-2514,alias_default_609,call_function,alias.default,forward,21,1,1,2,2387,3731,4
-2515,pow_44,call_function,pow.Tensor_Scalar,forward,21,1,1,1,2388,3730,4
-2516,mean_43,call_function,mean.dim,forward,21,1,1,1,2389,3729,4
-2517,add_107,call_function,add.Scalar,forward,21,1,1,1,2390,3728,3
-2518,rsqrt_43,call_function,rsqrt.default,forward,21,1,1,1,2391,3727,3
-2519,alias_default_610,call_function,alias.default,forward,21,1,1,3,2392,3726,3
-2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8
-2521,alias_default_608,call_function,alias.default,forward,21,1,1,2,2,3723,2
-2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8
-2523,convert_element_type_519,call_function,convert_element_type.default,forward,21,1,1,1,2398,3720,6
-2524,dtype_cast_196,call_function,dtype_cast.default,forward,21,1,1,1,1,3720,3
-2525,permute_239,call_function,permute.default,forward,21,1,1,1,2,3719,3
-2526,alias_default_611,call_function,alias.default,forward,21,1,1,4,2399,3719,4
-2527,alias_default_612,call_function,alias.default,forward,21,1,1,2,3,3718,3
-2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5
-2529,alias_default_613,call_function,alias.default,forward,21,1,1,2,2405,3715,4
-2530,convert_element_type_522,call_function,convert_element_type.default,forward,21,1,1,1,2406,3703,4
-2531,alias_default_614,call_function,alias.default,forward,21,1,1,2,2407,3702,4
-2532,neg_21,call_function,neg.default,forward,21,1,1,1,2408,3701,8
-2533,exp_21,call_function,exp.default,forward,21,1,1,1,2409,3700,6
-2534,add_108,call_function,add.Tensor,forward,21,1,1,1,2410,3699,4
-2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6
-2536,convert_element_type_523,call_function,convert_element_type.default,forward,21,1,1,1,2412,3697,6
-2537,dtype_cast_197,call_function,dtype_cast.default,forward,21,1,1,1,1,3701,3
-2538,permute_240,call_function,permute.default,forward,21,1,1,1,2,3700,3
-2539,alias_default_616,call_function,alias.default,forward,21,1,1,2,3,3699,3
-2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5
-2541,alias_default_615,call_function,alias.default,forward,21,1,1,2,2413,3696,4
-2542,alias_default_617,call_function,alias.default,forward,21,1,1,2,2405,3696,4
-2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8
-2544,dtype_cast_198,call_function,dtype_cast.default,forward,21,1,1,1,1,3697,3
-2545,permute_241,call_function,permute.default,forward,21,1,1,1,2,3696,3
-2546,alias_default_618,call_function,alias.default,forward,21,1,1,2,2421,3694,4
-2547,alias_default_619,call_function,alias.default,forward,21,1,1,2,3,3695,3
-2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5
-2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10
-2550,dtype_cast_199,call_function,dtype_cast.default,forward,22,1,1,1,1,3681,2
-2551,alias_default_620,call_function,alias.default,forward,21,1,1,3,2428,3691,4
-2552,convert_element_type_528,call_function,convert_element_type.default,forward,22,1,1,1,2429,3689,4
-2553,alias_default_622,call_function,alias.default,forward,22,1,1,2,2430,3688,4
-2554,pow_45,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2431,3687,4
-2555,mean_44,call_function,mean.dim,forward,22,1,1,1,2432,3686,4
-2556,add_110,call_function,add.Scalar,forward,22,1,1,1,2433,3685,3
-2557,rsqrt_44,call_function,rsqrt.default,forward,22,1,1,1,2434,3684,3
-2558,alias_default_623,call_function,alias.default,forward,22,1,1,3,2435,3683,3
-2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8
-2560,alias_default_621,call_function,alias.default,forward,22,1,1,2,2,3680,2
-2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8
-2562,convert_element_type_529,call_function,convert_element_type.default,forward,22,1,1,1,2441,3677,6
-2563,dtype_cast_200,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3
-2564,permute_242,call_function,permute.default,forward,22,1,1,1,2,3663,3
-2565,alias_default_624,call_function,alias.default,forward,22,1,1,6,2442,3676,4
-2566,alias_default_625,call_function,alias.default,forward,22,1,1,2,3,3662,3
-2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
-2568,dtype_cast_201,call_function,dtype_cast.default,forward,22,1,1,1,1,3664,3
-2569,permute_243,call_function,permute.default,forward,22,1,1,1,2,3663,3
-2570,alias_default_626,call_function,alias.default,forward,22,1,1,2,3,3662,3
-2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
-2572,dtype_cast_202,call_function,dtype_cast.default,forward,22,1,1,1,1,3657,3
-2573,permute_244,call_function,permute.default,forward,22,1,1,1,2,3656,3
-2574,alias_default_627,call_function,alias.default,forward,22,1,1,2,3,3655,3
-2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5
-2576,view_512,call_function,view.default,forward,22,1,1,1,2448,3659,4
-2577,view_513,call_function,view.default,forward,22,1,1,1,2448,3659,4
-2578,view_514,call_function,view.default,forward,22,1,1,1,2448,3652,4
-2579,convert_element_type_536,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4
-2580,view_515,call_function,view.default,forward,22,1,1,1,2450,3657,4
-2581,view_as_complex_44,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6
-2582,convert_element_type_537,call_function,convert_element_type.default,forward,22,1,1,1,2449,3658,4
-2583,view_516,call_function,view.default,forward,22,1,1,1,2450,3657,4
-2584,view_as_complex_45,call_function,view_as_complex.default,forward,22,1,1,1,2451,3656,6
-2585,view_517,call_function,view.default,forward,22,1,1,1,2,3667,3
-2586,alias_default_628,call_function,alias.default,forward,22,1,1,4,3,3666,3
-2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
-2588,view_as_real_44,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6
-2589,view_518,call_function,view.default,forward,22,1,1,1,2456,3653,6
-2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
-2591,view_as_real_45,call_function,view_as_real.default,forward,22,1,1,1,2455,3654,6
-2592,view_519,call_function,view.default,forward,22,1,1,1,2456,3653,6
-2593,convert_element_type_538,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6
-2594,convert_element_type_539,call_function,convert_element_type.default,forward,22,1,1,1,2457,3652,6
-2595,permute_245,call_function,permute.default,forward,22,1,1,1,2458,3651,6
-2596,permute_246,call_function,permute.default,forward,22,1,1,1,2458,3651,6
-2597,permute_247,call_function,permute.default,forward,22,1,1,1,2449,3651,4
-2598,alias_default_629,call_function,alias.default,forward,22,1,1,2,2459,3650,4
-2599,alias_default_630,call_function,alias.default,forward,22,1,1,2,2459,3650,4
-2600,alias_default_631,call_function,alias.default,forward,22,1,1,2,2450,3650,4
-2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2
-2602,getitem_198,call_function,getitem,forward,22,1,1,1,2484,3645,2
-2603,getitem_199,call_function,getitem,forward,22,1,1,1,2484,2484,2
-2604,getitem_204,call_function,getitem,forward,22,1,1,1,2484,2484,1
-2605,getitem_205,call_function,getitem,forward,22,1,1,1,2484,2484,1
-2606,alias_default_632,call_function,alias.default,forward,22,1,1,2,2485,3644,4
-2607,permute_248,call_function,permute.default,forward,22,1,1,1,2486,3643,4
-2608,view_520,call_function,view.default,forward,22,1,1,1,2487,3642,3
-2609,dtype_cast_203,call_function,dtype_cast.default,forward,22,1,1,1,1,3644,3
-2610,permute_249,call_function,permute.default,forward,22,1,1,1,2,3643,3
-2611,alias_default_633,call_function,alias.default,forward,22,1,1,2,2488,3641,4
-2612,alias_default_634,call_function,alias.default,forward,22,1,1,2,3,3642,3
-2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5
-2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10
-2615,dtype_cast_204,call_function,dtype_cast.default,forward,22,1,1,1,1,3628,2
-2616,alias_default_635,call_function,alias.default,forward,22,1,1,3,2495,3638,4
-2617,convert_element_type_542,call_function,convert_element_type.default,forward,22,1,1,1,2496,3636,4
-2618,alias_default_637,call_function,alias.default,forward,22,1,1,2,2497,3635,4
-2619,pow_46,call_function,pow.Tensor_Scalar,forward,22,1,1,1,2498,3634,4
-2620,mean_45,call_function,mean.dim,forward,22,1,1,1,2499,3633,4
-2621,add_112,call_function,add.Scalar,forward,22,1,1,1,2500,3632,3
-2622,rsqrt_45,call_function,rsqrt.default,forward,22,1,1,1,2501,3631,3
-2623,alias_default_638,call_function,alias.default,forward,22,1,1,3,2502,3630,3
-2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8
-2625,alias_default_636,call_function,alias.default,forward,22,1,1,2,2,3627,2
-2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8
-2627,convert_element_type_543,call_function,convert_element_type.default,forward,22,1,1,1,2508,3624,6
-2628,dtype_cast_205,call_function,dtype_cast.default,forward,22,1,1,1,1,3624,3
-2629,permute_250,call_function,permute.default,forward,22,1,1,1,2,3623,3
-2630,alias_default_639,call_function,alias.default,forward,22,1,1,4,2509,3623,4
-2631,alias_default_640,call_function,alias.default,forward,22,1,1,2,3,3622,3
-2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5
-2633,alias_default_641,call_function,alias.default,forward,22,1,1,2,2515,3619,4
-2634,convert_element_type_546,call_function,convert_element_type.default,forward,22,1,1,1,2516,3607,4
-2635,alias_default_642,call_function,alias.default,forward,22,1,1,2,2517,3606,4
-2636,neg_22,call_function,neg.default,forward,22,1,1,1,2518,3605,8
-2637,exp_22,call_function,exp.default,forward,22,1,1,1,2519,3604,6
-2638,add_113,call_function,add.Tensor,forward,22,1,1,1,2520,3603,4
-2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6
-2640,convert_element_type_547,call_function,convert_element_type.default,forward,22,1,1,1,2522,3601,6
-2641,dtype_cast_206,call_function,dtype_cast.default,forward,22,1,1,1,1,3605,3
-2642,permute_251,call_function,permute.default,forward,22,1,1,1,2,3604,3
-2643,alias_default_644,call_function,alias.default,forward,22,1,1,2,3,3603,3
-2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5
-2645,alias_default_643,call_function,alias.default,forward,22,1,1,2,2523,3600,4
-2646,alias_default_645,call_function,alias.default,forward,22,1,1,2,2515,3600,4
-2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8
-2648,dtype_cast_207,call_function,dtype_cast.default,forward,22,1,1,1,1,3601,3
-2649,permute_252,call_function,permute.default,forward,22,1,1,1,2,3600,3
-2650,alias_default_646,call_function,alias.default,forward,22,1,1,2,2531,3598,4
-2651,alias_default_647,call_function,alias.default,forward,22,1,1,2,3,3599,3
-2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5
-2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10
-2654,dtype_cast_208,call_function,dtype_cast.default,forward,23,1,1,1,1,3585,2
-2655,alias_default_648,call_function,alias.default,forward,22,1,1,3,2538,3595,4
-2656,convert_element_type_552,call_function,convert_element_type.default,forward,23,1,1,1,2539,3593,4
-2657,alias_default_650,call_function,alias.default,forward,23,1,1,2,2540,3592,4
-2658,pow_47,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2541,3591,4
-2659,mean_46,call_function,mean.dim,forward,23,1,1,1,2542,3590,4
-2660,add_115,call_function,add.Scalar,forward,23,1,1,1,2543,3589,3
-2661,rsqrt_46,call_function,rsqrt.default,forward,23,1,1,1,2544,3588,3
-2662,alias_default_651,call_function,alias.default,forward,23,1,1,3,2545,3587,3
-2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8
-2664,alias_default_649,call_function,alias.default,forward,23,1,1,2,2,3584,2
-2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8
-2666,convert_element_type_553,call_function,convert_element_type.default,forward,23,1,1,1,2551,3581,6
-2667,dtype_cast_209,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3
-2668,permute_253,call_function,permute.default,forward,23,1,1,1,2,3567,3
-2669,alias_default_652,call_function,alias.default,forward,23,1,1,6,2552,3580,4
-2670,alias_default_653,call_function,alias.default,forward,23,1,1,2,3,3566,3
-2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
-2672,dtype_cast_210,call_function,dtype_cast.default,forward,23,1,1,1,1,3568,3
-2673,permute_254,call_function,permute.default,forward,23,1,1,1,2,3567,3
-2674,alias_default_654,call_function,alias.default,forward,23,1,1,2,3,3566,3
-2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
-2676,dtype_cast_211,call_function,dtype_cast.default,forward,23,1,1,1,1,3561,3
-2677,permute_255,call_function,permute.default,forward,23,1,1,1,2,3560,3
-2678,alias_default_655,call_function,alias.default,forward,23,1,1,2,3,3559,3
-2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5
-2680,view_535,call_function,view.default,forward,23,1,1,1,2558,3563,4
-2681,view_536,call_function,view.default,forward,23,1,1,1,2558,3563,4
-2682,view_537,call_function,view.default,forward,23,1,1,1,2558,3556,4
-2683,convert_element_type_560,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4
-2684,view_538,call_function,view.default,forward,23,1,1,1,2560,3561,4
-2685,view_as_complex_46,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6
-2686,convert_element_type_561,call_function,convert_element_type.default,forward,23,1,1,1,2559,3562,4
-2687,view_539,call_function,view.default,forward,23,1,1,1,2560,3561,4
-2688,view_as_complex_47,call_function,view_as_complex.default,forward,23,1,1,1,2561,3560,6
-2689,view_540,call_function,view.default,forward,23,1,1,1,2,3571,3
-2690,alias_default_656,call_function,alias.default,forward,23,1,1,4,3,3570,3
-2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
-2692,view_as_real_46,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6
-2693,view_541,call_function,view.default,forward,23,1,1,1,2566,3557,6
-2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
-2695,view_as_real_47,call_function,view_as_real.default,forward,23,1,1,1,2565,3558,6
-2696,view_542,call_function,view.default,forward,23,1,1,1,2566,3557,6
-2697,convert_element_type_562,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6
-2698,convert_element_type_563,call_function,convert_element_type.default,forward,23,1,1,1,2567,3556,6
-2699,permute_256,call_function,permute.default,forward,23,1,1,1,2568,3555,6
-2700,permute_257,call_function,permute.default,forward,23,1,1,1,2568,3555,6
-2701,permute_258,call_function,permute.default,forward,23,1,1,1,2559,3555,4
-2702,alias_default_657,call_function,alias.default,forward,23,1,1,2,2569,3554,4
-2703,alias_default_658,call_function,alias.default,forward,23,1,1,2,2569,3554,4
-2704,alias_default_659,call_function,alias.default,forward,23,1,1,2,2560,3554,4
-2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2
-2706,getitem_207,call_function,getitem,forward,23,1,1,1,2594,3549,2
-2707,getitem_208,call_function,getitem,forward,23,1,1,1,2594,2594,2
-2708,getitem_213,call_function,getitem,forward,23,1,1,1,2594,2594,1
-2709,getitem_214,call_function,getitem,forward,23,1,1,1,2594,2594,1
-2710,alias_default_660,call_function,alias.default,forward,23,1,1,2,2595,3548,4
-2711,permute_259,call_function,permute.default,forward,23,1,1,1,2596,3547,4
-2712,view_543,call_function,view.default,forward,23,1,1,1,2597,3546,3
-2713,dtype_cast_212,call_function,dtype_cast.default,forward,23,1,1,1,1,3548,3
-2714,permute_260,call_function,permute.default,forward,23,1,1,1,2,3547,3
-2715,alias_default_661,call_function,alias.default,forward,23,1,1,2,2598,3545,4
-2716,alias_default_662,call_function,alias.default,forward,23,1,1,2,3,3546,3
-2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5
-2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10
-2719,dtype_cast_213,call_function,dtype_cast.default,forward,23,1,1,1,1,3532,2
-2720,alias_default_663,call_function,alias.default,forward,23,1,1,3,2605,3542,4
-2721,convert_element_type_566,call_function,convert_element_type.default,forward,23,1,1,1,2606,3540,4
-2722,alias_default_665,call_function,alias.default,forward,23,1,1,2,2607,3539,4
-2723,pow_48,call_function,pow.Tensor_Scalar,forward,23,1,1,1,2608,3538,4
-2724,mean_47,call_function,mean.dim,forward,23,1,1,1,2609,3537,4
-2725,add_117,call_function,add.Scalar,forward,23,1,1,1,2610,3536,3
-2726,rsqrt_47,call_function,rsqrt.default,forward,23,1,1,1,2611,3535,3
-2727,alias_default_666,call_function,alias.default,forward,23,1,1,3,2612,3534,3
-2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8
-2729,alias_default_664,call_function,alias.default,forward,23,1,1,2,2,3531,2
-2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8
-2731,convert_element_type_567,call_function,convert_element_type.default,forward,23,1,1,1,2618,3528,6
-2732,dtype_cast_214,call_function,dtype_cast.default,forward,23,1,1,1,1,3528,3
-2733,permute_261,call_function,permute.default,forward,23,1,1,1,2,3527,3
-2734,alias_default_667,call_function,alias.default,forward,23,1,1,4,2619,3527,4
-2735,alias_default_668,call_function,alias.default,forward,23,1,1,2,3,3526,3
-2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5
-2737,alias_default_669,call_function,alias.default,forward,23,1,1,2,2625,3523,4
-2738,convert_element_type_570,call_function,convert_element_type.default,forward,23,1,1,1,2626,3511,4
-2739,alias_default_670,call_function,alias.default,forward,23,1,1,2,2627,3510,4
-2740,neg_23,call_function,neg.default,forward,23,1,1,1,2628,3509,8
-2741,exp_23,call_function,exp.default,forward,23,1,1,1,2629,3508,6
-2742,add_118,call_function,add.Tensor,forward,23,1,1,1,2630,3507,4
-2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6
-2744,convert_element_type_571,call_function,convert_element_type.default,forward,23,1,1,1,2632,3505,6
-2745,dtype_cast_215,call_function,dtype_cast.default,forward,23,1,1,1,1,3509,3
-2746,permute_262,call_function,permute.default,forward,23,1,1,1,2,3508,3
-2747,alias_default_672,call_function,alias.default,forward,23,1,1,2,3,3507,3
-2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5
-2749,alias_default_671,call_function,alias.default,forward,23,1,1,2,2633,3504,4
-2750,alias_default_673,call_function,alias.default,forward,23,1,1,2,2625,3504,4
-2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8
-2752,dtype_cast_216,call_function,dtype_cast.default,forward,23,1,1,1,1,3505,3
-2753,permute_263,call_function,permute.default,forward,23,1,1,1,2,3504,3
-2754,alias_default_674,call_function,alias.default,forward,23,1,1,2,2641,3502,4
-2755,alias_default_675,call_function,alias.default,forward,23,1,1,2,3,3503,3
-2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5
-2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10
-2758,dtype_cast_217,call_function,dtype_cast.default,forward,24,1,1,1,1,3489,2
-2759,alias_default_676,call_function,alias.default,forward,23,1,1,3,2648,3499,4
-2760,convert_element_type_576,call_function,convert_element_type.default,forward,24,1,1,1,2649,3497,4
-2761,alias_default_678,call_function,alias.default,forward,24,1,1,2,2650,3496,4
-2762,pow_49,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2651,3495,4
-2763,mean_48,call_function,mean.dim,forward,24,1,1,1,2652,3494,4
-2764,add_120,call_function,add.Scalar,forward,24,1,1,1,2653,3493,3
-2765,rsqrt_48,call_function,rsqrt.default,forward,24,1,1,1,2654,3492,3
-2766,alias_default_679,call_function,alias.default,forward,24,1,1,3,2655,3491,3
-2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8
-2768,alias_default_677,call_function,alias.default,forward,24,1,1,2,2,3488,2
-2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8
-2770,convert_element_type_577,call_function,convert_element_type.default,forward,24,1,1,1,2661,3485,6
-2771,dtype_cast_218,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3
-2772,permute_264,call_function,permute.default,forward,24,1,1,1,2,3471,3
-2773,alias_default_680,call_function,alias.default,forward,24,1,1,6,2662,3484,4
-2774,alias_default_681,call_function,alias.default,forward,24,1,1,2,3,3470,3
-2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
-2776,dtype_cast_219,call_function,dtype_cast.default,forward,24,1,1,1,1,3472,3
-2777,permute_265,call_function,permute.default,forward,24,1,1,1,2,3471,3
-2778,alias_default_682,call_function,alias.default,forward,24,1,1,2,3,3470,3
-2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
-2780,dtype_cast_220,call_function,dtype_cast.default,forward,24,1,1,1,1,3465,3
-2781,permute_266,call_function,permute.default,forward,24,1,1,1,2,3464,3
-2782,alias_default_683,call_function,alias.default,forward,24,1,1,2,3,3463,3
-2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5
-2784,view_558,call_function,view.default,forward,24,1,1,1,2668,3467,4
-2785,view_559,call_function,view.default,forward,24,1,1,1,2668,3467,4
-2786,view_560,call_function,view.default,forward,24,1,1,1,2668,3460,4
-2787,convert_element_type_584,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4
-2788,view_561,call_function,view.default,forward,24,1,1,1,2670,3465,4
-2789,view_as_complex_48,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6
-2790,convert_element_type_585,call_function,convert_element_type.default,forward,24,1,1,1,2669,3466,4
-2791,view_562,call_function,view.default,forward,24,1,1,1,2670,3465,4
-2792,view_as_complex_49,call_function,view_as_complex.default,forward,24,1,1,1,2671,3464,6
-2793,view_563,call_function,view.default,forward,24,1,1,1,2,3475,3
-2794,alias_default_684,call_function,alias.default,forward,24,1,1,4,3,3474,3
-2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
-2796,view_as_real_48,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6
-2797,view_564,call_function,view.default,forward,24,1,1,1,2676,3461,6
-2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
-2799,view_as_real_49,call_function,view_as_real.default,forward,24,1,1,1,2675,3462,6
-2800,view_565,call_function,view.default,forward,24,1,1,1,2676,3461,6
-2801,convert_element_type_586,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6
-2802,convert_element_type_587,call_function,convert_element_type.default,forward,24,1,1,1,2677,3460,6
-2803,permute_267,call_function,permute.default,forward,24,1,1,1,2678,3459,6
-2804,permute_268,call_function,permute.default,forward,24,1,1,1,2678,3459,6
-2805,permute_269,call_function,permute.default,forward,24,1,1,1,2669,3459,4
-2806,alias_default_685,call_function,alias.default,forward,24,1,1,2,2679,3458,4
-2807,alias_default_686,call_function,alias.default,forward,24,1,1,2,2679,3458,4
-2808,alias_default_687,call_function,alias.default,forward,24,1,1,2,2670,3458,4
-2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2
-2810,getitem_216,call_function,getitem,forward,24,1,1,1,2704,3453,2
-2811,getitem_217,call_function,getitem,forward,24,1,1,1,2704,2704,2
-2812,getitem_222,call_function,getitem,forward,24,1,1,1,2704,2704,1
-2813,getitem_223,call_function,getitem,forward,24,1,1,1,2704,2704,1
-2814,alias_default_688,call_function,alias.default,forward,24,1,1,2,2705,3452,4
-2815,permute_270,call_function,permute.default,forward,24,1,1,1,2706,3451,4
-2816,view_566,call_function,view.default,forward,24,1,1,1,2707,3450,3
-2817,dtype_cast_221,call_function,dtype_cast.default,forward,24,1,1,1,1,3452,3
-2818,permute_271,call_function,permute.default,forward,24,1,1,1,2,3451,3
-2819,alias_default_689,call_function,alias.default,forward,24,1,1,2,2708,3449,4
-2820,alias_default_690,call_function,alias.default,forward,24,1,1,2,3,3450,3
-2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5
-2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10
-2823,dtype_cast_222,call_function,dtype_cast.default,forward,24,1,1,1,1,3436,2
-2824,alias_default_691,call_function,alias.default,forward,24,1,1,3,2715,3446,4
-2825,convert_element_type_590,call_function,convert_element_type.default,forward,24,1,1,1,2716,3444,4
-2826,alias_default_693,call_function,alias.default,forward,24,1,1,2,2717,3443,4
-2827,pow_50,call_function,pow.Tensor_Scalar,forward,24,1,1,1,2718,3442,4
-2828,mean_49,call_function,mean.dim,forward,24,1,1,1,2719,3441,4
-2829,add_122,call_function,add.Scalar,forward,24,1,1,1,2720,3440,3
-2830,rsqrt_49,call_function,rsqrt.default,forward,24,1,1,1,2721,3439,3
-2831,alias_default_694,call_function,alias.default,forward,24,1,1,3,2722,3438,3
-2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8
-2833,alias_default_692,call_function,alias.default,forward,24,1,1,2,2,3435,2
-2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8
-2835,convert_element_type_591,call_function,convert_element_type.default,forward,24,1,1,1,2728,3432,6
-2836,dtype_cast_223,call_function,dtype_cast.default,forward,24,1,1,1,1,3432,3
-2837,permute_272,call_function,permute.default,forward,24,1,1,1,2,3431,3
-2838,alias_default_695,call_function,alias.default,forward,24,1,1,4,2729,3431,4
-2839,alias_default_696,call_function,alias.default,forward,24,1,1,2,3,3430,3
-2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5
-2841,alias_default_697,call_function,alias.default,forward,24,1,1,2,2735,3427,4
-2842,convert_element_type_594,call_function,convert_element_type.default,forward,24,1,1,1,2736,3415,4
-2843,alias_default_698,call_function,alias.default,forward,24,1,1,2,2737,3414,4
-2844,neg_24,call_function,neg.default,forward,24,1,1,1,2738,3413,8
-2845,exp_24,call_function,exp.default,forward,24,1,1,1,2739,3412,6
-2846,add_123,call_function,add.Tensor,forward,24,1,1,1,2740,3411,4
-2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6
-2848,convert_element_type_595,call_function,convert_element_type.default,forward,24,1,1,1,2742,3409,6
-2849,dtype_cast_224,call_function,dtype_cast.default,forward,24,1,1,1,1,3413,3
-2850,permute_273,call_function,permute.default,forward,24,1,1,1,2,3412,3
-2851,alias_default_700,call_function,alias.default,forward,24,1,1,2,3,3411,3
-2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5
-2853,alias_default_699,call_function,alias.default,forward,24,1,1,2,2743,3408,4
-2854,alias_default_701,call_function,alias.default,forward,24,1,1,2,2735,3408,4
-2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8
-2856,dtype_cast_225,call_function,dtype_cast.default,forward,24,1,1,1,1,3409,3
-2857,permute_274,call_function,permute.default,forward,24,1,1,1,2,3408,3
-2858,alias_default_702,call_function,alias.default,forward,24,1,1,2,2751,3406,4
-2859,alias_default_703,call_function,alias.default,forward,24,1,1,2,3,3407,3
-2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5
-2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10
-2862,dtype_cast_226,call_function,dtype_cast.default,forward,25,1,1,1,1,3393,2
-2863,alias_default_704,call_function,alias.default,forward,24,1,1,3,2758,3403,4
-2864,convert_element_type_600,call_function,convert_element_type.default,forward,25,1,1,1,2759,3401,4
-2865,alias_default_706,call_function,alias.default,forward,25,1,1,2,2760,3400,4
-2866,pow_51,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2761,3399,4
-2867,mean_50,call_function,mean.dim,forward,25,1,1,1,2762,3398,4
-2868,add_125,call_function,add.Scalar,forward,25,1,1,1,2763,3397,3
-2869,rsqrt_50,call_function,rsqrt.default,forward,25,1,1,1,2764,3396,3
-2870,alias_default_707,call_function,alias.default,forward,25,1,1,3,2765,3395,3
-2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8
-2872,alias_default_705,call_function,alias.default,forward,25,1,1,2,2,3392,2
-2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8
-2874,convert_element_type_601,call_function,convert_element_type.default,forward,25,1,1,1,2771,3389,6
-2875,dtype_cast_227,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3
-2876,permute_275,call_function,permute.default,forward,25,1,1,1,2,3375,3
-2877,alias_default_708,call_function,alias.default,forward,25,1,1,6,2772,3388,4
-2878,alias_default_709,call_function,alias.default,forward,25,1,1,2,3,3374,3
-2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
-2880,dtype_cast_228,call_function,dtype_cast.default,forward,25,1,1,1,1,3376,3
-2881,permute_276,call_function,permute.default,forward,25,1,1,1,2,3375,3
-2882,alias_default_710,call_function,alias.default,forward,25,1,1,2,3,3374,3
-2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
-2884,dtype_cast_229,call_function,dtype_cast.default,forward,25,1,1,1,1,3369,3
-2885,permute_277,call_function,permute.default,forward,25,1,1,1,2,3368,3
-2886,alias_default_711,call_function,alias.default,forward,25,1,1,2,3,3367,3
-2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5
-2888,view_581,call_function,view.default,forward,25,1,1,1,2778,3371,4
-2889,view_582,call_function,view.default,forward,25,1,1,1,2778,3371,4
-2890,view_583,call_function,view.default,forward,25,1,1,1,2778,3364,4
-2891,convert_element_type_608,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4
-2892,view_584,call_function,view.default,forward,25,1,1,1,2780,3369,4
-2893,view_as_complex_50,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6
-2894,convert_element_type_609,call_function,convert_element_type.default,forward,25,1,1,1,2779,3370,4
-2895,view_585,call_function,view.default,forward,25,1,1,1,2780,3369,4
-2896,view_as_complex_51,call_function,view_as_complex.default,forward,25,1,1,1,2781,3368,6
-2897,view_586,call_function,view.default,forward,25,1,1,1,2,3379,3
-2898,alias_default_712,call_function,alias.default,forward,25,1,1,4,3,3378,3
-2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
-2900,view_as_real_50,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6
-2901,view_587,call_function,view.default,forward,25,1,1,1,2786,3365,6
-2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
-2903,view_as_real_51,call_function,view_as_real.default,forward,25,1,1,1,2785,3366,6
-2904,view_588,call_function,view.default,forward,25,1,1,1,2786,3365,6
-2905,convert_element_type_610,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6
-2906,convert_element_type_611,call_function,convert_element_type.default,forward,25,1,1,1,2787,3364,6
-2907,permute_278,call_function,permute.default,forward,25,1,1,1,2788,3363,6
-2908,permute_279,call_function,permute.default,forward,25,1,1,1,2788,3363,6
-2909,permute_280,call_function,permute.default,forward,25,1,1,1,2779,3363,4
-2910,alias_default_713,call_function,alias.default,forward,25,1,1,2,2789,3362,4
-2911,alias_default_714,call_function,alias.default,forward,25,1,1,2,2789,3362,4
-2912,alias_default_715,call_function,alias.default,forward,25,1,1,2,2780,3362,4
-2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2
-2914,getitem_225,call_function,getitem,forward,25,1,1,1,2814,3357,2
-2915,getitem_226,call_function,getitem,forward,25,1,1,1,2814,2814,2
-2916,getitem_231,call_function,getitem,forward,25,1,1,1,2814,2814,1
-2917,getitem_232,call_function,getitem,forward,25,1,1,1,2814,2814,1
-2918,alias_default_716,call_function,alias.default,forward,25,1,1,2,2815,3356,4
-2919,permute_281,call_function,permute.default,forward,25,1,1,1,2816,3355,4
-2920,view_589,call_function,view.default,forward,25,1,1,1,2817,3354,3
-2921,dtype_cast_230,call_function,dtype_cast.default,forward,25,1,1,1,1,3356,3
-2922,permute_282,call_function,permute.default,forward,25,1,1,1,2,3355,3
-2923,alias_default_717,call_function,alias.default,forward,25,1,1,2,2818,3353,4
-2924,alias_default_718,call_function,alias.default,forward,25,1,1,2,3,3354,3
-2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5
-2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10
-2927,dtype_cast_231,call_function,dtype_cast.default,forward,25,1,1,1,1,3340,2
-2928,alias_default_719,call_function,alias.default,forward,25,1,1,3,2825,3350,4
-2929,convert_element_type_614,call_function,convert_element_type.default,forward,25,1,1,1,2826,3348,4
-2930,alias_default_721,call_function,alias.default,forward,25,1,1,2,2827,3347,4
-2931,pow_52,call_function,pow.Tensor_Scalar,forward,25,1,1,1,2828,3346,4
-2932,mean_51,call_function,mean.dim,forward,25,1,1,1,2829,3345,4
-2933,add_127,call_function,add.Scalar,forward,25,1,1,1,2830,3344,3
-2934,rsqrt_51,call_function,rsqrt.default,forward,25,1,1,1,2831,3343,3
-2935,alias_default_722,call_function,alias.default,forward,25,1,1,3,2832,3342,3
-2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8
-2937,alias_default_720,call_function,alias.default,forward,25,1,1,2,2,3339,2
-2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8
-2939,convert_element_type_615,call_function,convert_element_type.default,forward,25,1,1,1,2838,3336,6
-2940,dtype_cast_232,call_function,dtype_cast.default,forward,25,1,1,1,1,3336,3
-2941,permute_283,call_function,permute.default,forward,25,1,1,1,2,3335,3
-2942,alias_default_723,call_function,alias.default,forward,25,1,1,4,2839,3335,4
-2943,alias_default_724,call_function,alias.default,forward,25,1,1,2,3,3334,3
-2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5
-2945,alias_default_725,call_function,alias.default,forward,25,1,1,2,2845,3331,4
-2946,convert_element_type_618,call_function,convert_element_type.default,forward,25,1,1,1,2846,3319,4
-2947,alias_default_726,call_function,alias.default,forward,25,1,1,2,2847,3318,4
-2948,neg_25,call_function,neg.default,forward,25,1,1,1,2848,3317,8
-2949,exp_25,call_function,exp.default,forward,25,1,1,1,2849,3316,6
-2950,add_128,call_function,add.Tensor,forward,25,1,1,1,2850,3315,4
-2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6
-2952,convert_element_type_619,call_function,convert_element_type.default,forward,25,1,1,1,2852,3313,6
-2953,dtype_cast_233,call_function,dtype_cast.default,forward,25,1,1,1,1,3317,3
-2954,permute_284,call_function,permute.default,forward,25,1,1,1,2,3316,3
-2955,alias_default_728,call_function,alias.default,forward,25,1,1,2,3,3315,3
-2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5
-2957,alias_default_727,call_function,alias.default,forward,25,1,1,2,2853,3312,4
-2958,alias_default_729,call_function,alias.default,forward,25,1,1,2,2845,3312,4
-2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8
-2960,dtype_cast_234,call_function,dtype_cast.default,forward,25,1,1,1,1,3313,3
-2961,permute_285,call_function,permute.default,forward,25,1,1,1,2,3312,3
-2962,alias_default_730,call_function,alias.default,forward,25,1,1,2,2861,3310,4
-2963,alias_default_731,call_function,alias.default,forward,25,1,1,2,3,3311,3
-2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5
-2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10
-2966,dtype_cast_235,call_function,dtype_cast.default,forward,26,1,1,1,1,3297,2
-2967,alias_default_732,call_function,alias.default,forward,25,1,1,3,2868,3307,4
-2968,convert_element_type_624,call_function,convert_element_type.default,forward,26,1,1,1,2869,3305,4
-2969,alias_default_734,call_function,alias.default,forward,26,1,1,2,2870,3304,4
-2970,pow_53,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2871,3303,4
-2971,mean_52,call_function,mean.dim,forward,26,1,1,1,2872,3302,4
-2972,add_130,call_function,add.Scalar,forward,26,1,1,1,2873,3301,3
-2973,rsqrt_52,call_function,rsqrt.default,forward,26,1,1,1,2874,3300,3
-2974,alias_default_735,call_function,alias.default,forward,26,1,1,3,2875,3299,3
-2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8
-2976,alias_default_733,call_function,alias.default,forward,26,1,1,2,2,3296,2
-2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8
-2978,convert_element_type_625,call_function,convert_element_type.default,forward,26,1,1,1,2881,3293,6
-2979,dtype_cast_236,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3
-2980,permute_286,call_function,permute.default,forward,26,1,1,1,2,3279,3
-2981,alias_default_736,call_function,alias.default,forward,26,1,1,6,2882,3292,4
-2982,alias_default_737,call_function,alias.default,forward,26,1,1,2,3,3278,3
-2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
-2984,dtype_cast_237,call_function,dtype_cast.default,forward,26,1,1,1,1,3280,3
-2985,permute_287,call_function,permute.default,forward,26,1,1,1,2,3279,3
-2986,alias_default_738,call_function,alias.default,forward,26,1,1,2,3,3278,3
-2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
-2988,dtype_cast_238,call_function,dtype_cast.default,forward,26,1,1,1,1,3273,3
-2989,permute_288,call_function,permute.default,forward,26,1,1,1,2,3272,3
-2990,alias_default_739,call_function,alias.default,forward,26,1,1,2,3,3271,3
-2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5
-2992,view_604,call_function,view.default,forward,26,1,1,1,2888,3275,4
-2993,view_605,call_function,view.default,forward,26,1,1,1,2888,3275,4
-2994,view_606,call_function,view.default,forward,26,1,1,1,2888,3268,4
-2995,convert_element_type_632,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4
-2996,view_607,call_function,view.default,forward,26,1,1,1,2890,3273,4
-2997,view_as_complex_52,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6
-2998,convert_element_type_633,call_function,convert_element_type.default,forward,26,1,1,1,2889,3274,4
-2999,view_608,call_function,view.default,forward,26,1,1,1,2890,3273,4
-3000,view_as_complex_53,call_function,view_as_complex.default,forward,26,1,1,1,2891,3272,6
-3001,view_609,call_function,view.default,forward,26,1,1,1,2,3283,3
-3002,alias_default_740,call_function,alias.default,forward,26,1,1,4,3,3282,3
-3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
-3004,view_as_real_52,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6
-3005,view_610,call_function,view.default,forward,26,1,1,1,2896,3269,6
-3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
-3007,view_as_real_53,call_function,view_as_real.default,forward,26,1,1,1,2895,3270,6
-3008,view_611,call_function,view.default,forward,26,1,1,1,2896,3269,6
-3009,convert_element_type_634,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6
-3010,convert_element_type_635,call_function,convert_element_type.default,forward,26,1,1,1,2897,3268,6
-3011,permute_289,call_function,permute.default,forward,26,1,1,1,2898,3267,6
-3012,permute_290,call_function,permute.default,forward,26,1,1,1,2898,3267,6
-3013,permute_291,call_function,permute.default,forward,26,1,1,1,2889,3267,4
-3014,alias_default_741,call_function,alias.default,forward,26,1,1,2,2899,3266,4
-3015,alias_default_742,call_function,alias.default,forward,26,1,1,2,2899,3266,4
-3016,alias_default_743,call_function,alias.default,forward,26,1,1,2,2890,3266,4
-3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2
-3018,getitem_234,call_function,getitem,forward,26,1,1,1,2924,3261,2
-3019,getitem_235,call_function,getitem,forward,26,1,1,1,2924,2924,2
-3020,getitem_240,call_function,getitem,forward,26,1,1,1,2924,2924,1
-3021,getitem_241,call_function,getitem,forward,26,1,1,1,2924,2924,1
-3022,alias_default_744,call_function,alias.default,forward,26,1,1,2,2925,3260,4
-3023,permute_292,call_function,permute.default,forward,26,1,1,1,2926,3259,4
-3024,view_612,call_function,view.default,forward,26,1,1,1,2927,3258,3
-3025,dtype_cast_239,call_function,dtype_cast.default,forward,26,1,1,1,1,3260,3
-3026,permute_293,call_function,permute.default,forward,26,1,1,1,2,3259,3
-3027,alias_default_745,call_function,alias.default,forward,26,1,1,2,2928,3257,4
-3028,alias_default_746,call_function,alias.default,forward,26,1,1,2,3,3258,3
-3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5
-3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10
-3031,dtype_cast_240,call_function,dtype_cast.default,forward,26,1,1,1,1,3244,2
-3032,alias_default_747,call_function,alias.default,forward,26,1,1,3,2935,3254,4
-3033,convert_element_type_638,call_function,convert_element_type.default,forward,26,1,1,1,2936,3252,4
-3034,alias_default_749,call_function,alias.default,forward,26,1,1,2,2937,3251,4
-3035,pow_54,call_function,pow.Tensor_Scalar,forward,26,1,1,1,2938,3250,4
-3036,mean_53,call_function,mean.dim,forward,26,1,1,1,2939,3249,4
-3037,add_132,call_function,add.Scalar,forward,26,1,1,1,2940,3248,3
-3038,rsqrt_53,call_function,rsqrt.default,forward,26,1,1,1,2941,3247,3
-3039,alias_default_750,call_function,alias.default,forward,26,1,1,3,2942,3246,3
-3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8
-3041,alias_default_748,call_function,alias.default,forward,26,1,1,2,2,3243,2
-3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8
-3043,convert_element_type_639,call_function,convert_element_type.default,forward,26,1,1,1,2948,3240,6
-3044,dtype_cast_241,call_function,dtype_cast.default,forward,26,1,1,1,1,3240,3
-3045,permute_294,call_function,permute.default,forward,26,1,1,1,2,3239,3
-3046,alias_default_751,call_function,alias.default,forward,26,1,1,4,2949,3239,4
-3047,alias_default_752,call_function,alias.default,forward,26,1,1,2,3,3238,3
-3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5
-3049,alias_default_753,call_function,alias.default,forward,26,1,1,2,2955,3235,4
-3050,convert_element_type_642,call_function,convert_element_type.default,forward,26,1,1,1,2956,3223,4
-3051,alias_default_754,call_function,alias.default,forward,26,1,1,2,2957,3222,4
-3052,neg_26,call_function,neg.default,forward,26,1,1,1,2958,3221,8
-3053,exp_26,call_function,exp.default,forward,26,1,1,1,2959,3220,6
-3054,add_133,call_function,add.Tensor,forward,26,1,1,1,2960,3219,4
-3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6
-3056,convert_element_type_643,call_function,convert_element_type.default,forward,26,1,1,1,2962,3217,6
-3057,dtype_cast_242,call_function,dtype_cast.default,forward,26,1,1,1,1,3221,3
-3058,permute_295,call_function,permute.default,forward,26,1,1,1,2,3220,3
-3059,alias_default_756,call_function,alias.default,forward,26,1,1,2,3,3219,3
-3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5
-3061,alias_default_755,call_function,alias.default,forward,26,1,1,2,2963,3216,4
-3062,alias_default_757,call_function,alias.default,forward,26,1,1,2,2955,3216,4
-3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8
-3064,dtype_cast_243,call_function,dtype_cast.default,forward,26,1,1,1,1,3217,3
-3065,permute_296,call_function,permute.default,forward,26,1,1,1,2,3216,3
-3066,alias_default_758,call_function,alias.default,forward,26,1,1,2,2971,3214,4
-3067,alias_default_759,call_function,alias.default,forward,26,1,1,2,3,3215,3
-3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5
-3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10
-3070,dtype_cast_244,call_function,dtype_cast.default,forward,27,1,1,1,1,3201,2
-3071,alias_default_760,call_function,alias.default,forward,26,1,1,3,2978,3211,4
-3072,convert_element_type_648,call_function,convert_element_type.default,forward,27,1,1,1,2979,3209,4
-3073,alias_default_762,call_function,alias.default,forward,27,1,1,2,2980,3208,4
-3074,pow_55,call_function,pow.Tensor_Scalar,forward,27,1,1,1,2981,3207,4
-3075,mean_54,call_function,mean.dim,forward,27,1,1,1,2982,3206,4
-3076,add_135,call_function,add.Scalar,forward,27,1,1,1,2983,3205,3
-3077,rsqrt_54,call_function,rsqrt.default,forward,27,1,1,1,2984,3204,3
-3078,alias_default_763,call_function,alias.default,forward,27,1,1,3,2985,3203,3
-3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8
-3080,alias_default_761,call_function,alias.default,forward,27,1,1,2,2,3200,2
-3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8
-3082,convert_element_type_649,call_function,convert_element_type.default,forward,27,1,1,1,2991,3197,6
-3083,dtype_cast_245,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3
-3084,permute_297,call_function,permute.default,forward,27,1,1,1,2,3183,3
-3085,alias_default_764,call_function,alias.default,forward,27,1,1,6,2992,3196,4
-3086,alias_default_765,call_function,alias.default,forward,27,1,1,2,3,3182,3
-3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
-3088,dtype_cast_246,call_function,dtype_cast.default,forward,27,1,1,1,1,3184,3
-3089,permute_298,call_function,permute.default,forward,27,1,1,1,2,3183,3
-3090,alias_default_766,call_function,alias.default,forward,27,1,1,2,3,3182,3
-3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
-3092,dtype_cast_247,call_function,dtype_cast.default,forward,27,1,1,1,1,3177,3
-3093,permute_299,call_function,permute.default,forward,27,1,1,1,2,3176,3
-3094,alias_default_767,call_function,alias.default,forward,27,1,1,2,3,3175,3
-3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5
-3096,view_627,call_function,view.default,forward,27,1,1,1,2998,3179,4
-3097,view_628,call_function,view.default,forward,27,1,1,1,2998,3179,4
-3098,view_629,call_function,view.default,forward,27,1,1,1,2998,3172,4
-3099,convert_element_type_656,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4
-3100,view_630,call_function,view.default,forward,27,1,1,1,3000,3177,4
-3101,view_as_complex_54,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6
-3102,convert_element_type_657,call_function,convert_element_type.default,forward,27,1,1,1,2999,3178,4
-3103,view_631,call_function,view.default,forward,27,1,1,1,3000,3177,4
-3104,view_as_complex_55,call_function,view_as_complex.default,forward,27,1,1,1,3001,3176,6
-3105,view_632,call_function,view.default,forward,27,1,1,1,2,3187,3
-3106,alias_default_768,call_function,alias.default,forward,27,1,1,4,3,3186,3
-3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
-3108,view_as_real_54,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6
-3109,view_633,call_function,view.default,forward,27,1,1,1,3006,3173,6
-3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
-3111,view_as_real_55,call_function,view_as_real.default,forward,27,1,1,1,3005,3174,6
-3112,view_634,call_function,view.default,forward,27,1,1,1,3006,3173,6
-3113,convert_element_type_658,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6
-3114,convert_element_type_659,call_function,convert_element_type.default,forward,27,1,1,1,3007,3172,6
-3115,permute_300,call_function,permute.default,forward,27,1,1,1,3008,3171,6
-3116,permute_301,call_function,permute.default,forward,27,1,1,1,3008,3171,6
-3117,permute_302,call_function,permute.default,forward,27,1,1,1,2999,3171,4
-3118,alias_default_769,call_function,alias.default,forward,27,1,1,2,3009,3170,4
-3119,alias_default_770,call_function,alias.default,forward,27,1,1,2,3009,3170,4
-3120,alias_default_771,call_function,alias.default,forward,27,1,1,2,3000,3170,4
-3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2
-3122,getitem_243,call_function,getitem,forward,27,1,1,1,3034,3165,2
-3123,getitem_244,call_function,getitem,forward,27,1,1,1,3034,3034,2
-3124,getitem_249,call_function,getitem,forward,27,1,1,1,3034,3034,1
-3125,getitem_250,call_function,getitem,forward,27,1,1,1,3034,3034,1
-3126,alias_default_772,call_function,alias.default,forward,27,1,1,2,3035,3164,4
-3127,permute_303,call_function,permute.default,forward,27,1,1,1,3036,3163,4
-3128,view_635,call_function,view.default,forward,27,1,1,1,3037,3162,3
-3129,dtype_cast_248,call_function,dtype_cast.default,forward,27,1,1,1,1,3164,3
-3130,permute_304,call_function,permute.default,forward,27,1,1,1,2,3163,3
-3131,alias_default_773,call_function,alias.default,forward,27,1,1,2,3038,3161,4
-3132,alias_default_774,call_function,alias.default,forward,27,1,1,2,3,3162,3
-3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5
-3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10
-3135,dtype_cast_249,call_function,dtype_cast.default,forward,27,1,1,1,1,3148,2
-3136,alias_default_775,call_function,alias.default,forward,27,1,1,3,3045,3158,4
-3137,convert_element_type_662,call_function,convert_element_type.default,forward,27,1,1,1,3046,3156,4
-3138,alias_default_777,call_function,alias.default,forward,27,1,1,2,3047,3155,4
-3139,pow_56,call_function,pow.Tensor_Scalar,forward,27,1,1,1,3048,3154,4
-3140,mean_55,call_function,mean.dim,forward,27,1,1,1,3049,3153,4
-3141,add_137,call_function,add.Scalar,forward,27,1,1,1,3050,3152,3
-3142,rsqrt_55,call_function,rsqrt.default,forward,27,1,1,1,3051,3151,3
-3143,alias_default_778,call_function,alias.default,forward,27,1,1,3,3052,3150,3
-3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8
-3145,alias_default_776,call_function,alias.default,forward,27,1,1,2,2,3147,2
-3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8
-3147,convert_element_type_663,call_function,convert_element_type.default,forward,27,1,1,1,3058,3144,6
-3148,dtype_cast_250,call_function,dtype_cast.default,forward,27,1,1,1,1,3144,3
-3149,permute_305,call_function,permute.default,forward,27,1,1,1,2,3143,3
-3150,alias_default_779,call_function,alias.default,forward,27,1,1,4,3059,3143,4
-3151,alias_default_780,call_function,alias.default,forward,27,1,1,2,3,3142,3
-3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5
-3153,alias_default_781,call_function,alias.default,forward,27,1,1,2,3065,3139,4
-3154,convert_element_type_666,call_function,convert_element_type.default,forward,27,1,1,1,3066,3127,4
-3155,alias_default_782,call_function,alias.default,forward,27,1,1,2,3067,3126,4
-3156,neg_27,call_function,neg.default,forward,27,1,1,1,3068,3125,8
-3157,exp_27,call_function,exp.default,forward,27,1,1,1,3069,3124,6
-3158,add_138,call_function,add.Tensor,forward,27,1,1,1,3070,3123,4
-3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6
-3160,convert_element_type_667,call_function,convert_element_type.default,forward,27,1,1,1,3072,3121,6
-3161,dtype_cast_251,call_function,dtype_cast.default,forward,27,1,1,1,1,3125,3
-3162,permute_306,call_function,permute.default,forward,27,1,1,1,2,3124,3
-3163,alias_default_784,call_function,alias.default,forward,27,1,1,2,3,3123,3
-3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5
-3165,alias_default_783,call_function,alias.default,forward,27,1,1,2,3073,3120,4
-3166,alias_default_785,call_function,alias.default,forward,27,1,1,2,3065,3120,4
-3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8
-3168,dtype_cast_252,call_function,dtype_cast.default,forward,27,1,1,1,1,3121,3
-3169,permute_307,call_function,permute.default,forward,27,1,1,1,2,3120,3
-3170,alias_default_786,call_function,alias.default,forward,27,1,1,2,3081,3118,4
-3171,alias_default_787,call_function,alias.default,forward,27,1,1,2,3,3119,3
-3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5
-3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10
-3174,dtype_cast_253,call_function,dtype_cast.default,forward,,1,1,1,1,3102,2
-3175,alias_default_788,call_function,alias.default,forward,27,1,1,2,3088,3115,4
-3176,convert_element_type_672,call_function,convert_element_type.default,forward,,1,1,1,3089,3113,4
-3177,alias_default_790,call_function,alias.default,forward,,1,1,2,3090,3112,4
-3178,pow_57,call_function,pow.Tensor_Scalar,forward,,1,1,1,3091,3111,4
-3179,mean_56,call_function,mean.dim,forward,,1,1,1,3092,3110,4
-3180,add_140,call_function,add.Scalar,forward,,1,1,1,3093,3109,3
-3181,rsqrt_56,call_function,rsqrt.default,forward,,1,1,1,3094,3108,3
-3182,alias_default_791,call_function,alias.default,forward,,1,1,3,3095,3107,3
-3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8
-3184,alias_default_789,call_function,alias.default,forward,,1,1,2,2,3101,2
-3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8
-3186,convert_element_type_673,call_function,convert_element_type.default,forward,,1,1,1,3101,8,6
-3187,dtype_cast_254,call_function,dtype_cast.default,forward,,1,1,1,2,3105,3
-3188,permute_308,call_function,permute.default,forward,,1,1,1,3,3104,3
-3189,alias_default_792,call_function,alias.default,forward,,1,1,2,3102,7,4
-3190,alias_default_793,call_function,alias.default,forward,,1,1,2,4,3103,3
-3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5
-3192,alias_default_1245,call_function,alias.default,forward,,1,1,0,3107,0,4
-3193,alias_default_3,call_function,alias.default,unknown,,1,1,2,1,3103,4
-3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5
-3195,permute_311,call_function,permute.default,backward,,1,1,1,5,3100,3
-3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5
-3197,permute_312,call_function,permute.default,backward,,1,1,1,3106,3,4
-3198,dtype_cast_255,call_function,dtype_cast.default,backward,,1,1,1,3107,2,4
-3199,convert_element_type_680,call_function,convert_element_type.default,backward,,1,1,1,9,3098,5
-3200,convert_element_type_681,call_function,convert_element_type.default,backward,,1,1,1,3089,3098,4
-3201,convert_element_type_682,call_function,convert_element_type.default,backward,,1,1,1,3,3092,2
-3202,alias_default_794,call_function,alias.default,backward,,1,1,2,10,3097,4
-3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8
-3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8
-3205,alias_default_795,call_function,alias.default,backward,,1,1,2,16,3090,4
-3206,alias_default_796,call_function,alias.default,backward,,1,1,3,3098,3096,4
-3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8
-3208,sum_1,call_function,sum.dim_IntList,backward,,1,1,1,3115,3088,5
-3209,div_28,call_function,div.Tensor,backward,,1,1,1,3099,3088,6
-3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8
-3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10
-3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8
-3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8
-3214,sum_2,call_function,sum.dim_IntList,backward,,1,1,1,3109,3,5
-3215,convert_element_type_683,call_function,convert_element_type.default,backward,,1,1,1,3120,3084,6
-3216,convert_element_type_684,call_function,convert_element_type.default,backward,,1,1,1,3110,2,3
-3217,dtype_cast_256,call_function,dtype_cast.default,backward,,1,1,1,3111,1,3
-3218,alias_default_1499,call_function,alias.default,backward,,1,1,0,3112,0,2
-3219,alias_default_797,call_function,alias.default,backward,,1,1,3,3121,3083,4
-3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5
-3221,permute_315,call_function,permute.default,backward,27,1,1,1,4,3079,3
-3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5
-3223,permute_316,call_function,permute.default,backward,27,1,1,1,3123,2,4
-3224,dtype_cast_257,call_function,dtype_cast.default,backward,27,1,1,1,3124,1,4
-3225,alias_default_1495,call_function,alias.default,backward,27,1,1,0,3125,0,3
-3226,alias_default_798,call_function,alias.default,backward,27,1,1,2,3124,3077,4
-3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8
-3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8
-3229,alias_default_799,call_function,alias.default,backward,27,1,1,2,3126,3064,4
-3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5
-3231,permute_319,call_function,permute.default,backward,27,1,1,1,4,3060,3
-3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5
-3233,permute_320,call_function,permute.default,backward,27,1,1,1,3128,2,4
-3234,dtype_cast_258,call_function,dtype_cast.default,backward,27,1,1,1,3129,1,4
-3235,alias_default_1496,call_function,alias.default,backward,27,1,1,0,3130,0,3
-3236,convert_element_type_693,call_function,convert_element_type.default,backward,27,1,1,1,3126,3068,6
-3237,convert_element_type_694,call_function,convert_element_type.default,backward,27,1,1,1,3066,3078,4
-3238,alias_default_800,call_function,alias.default,backward,27,1,1,2,3067,3077,4
-3239,neg_28,call_function,neg.default,backward,27,1,1,1,3068,3076,8
-3240,exp_28,call_function,exp.default,backward,27,1,1,1,3069,3075,6
-3241,add_141,call_function,add.Tensor,backward,27,1,1,1,3070,3074,4
-3242,reciprocal,call_function,reciprocal.default,backward,27,1,1,1,3071,3073,4
-3243,mul_206,call_function,mul.Tensor,backward,27,1,1,1,3072,3072,6
-3244,alias_default_801,call_function,alias.default,backward,27,1,1,2,3073,3071,4
-3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8
-3246,sub_1,call_function,sub.Tensor,backward,27,1,1,1,3074,3069,4
-3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8
-3248,add_142,call_function,add.Tensor,backward,27,1,1,1,3076,3067,4
-3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8
-3250,convert_element_type_695,call_function,convert_element_type.default,backward,27,1,1,1,3140,3065,6
-3251,alias_default_802,call_function,alias.default,backward,27,1,1,2,3141,3064,4
-3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5
-3253,permute_323,call_function,permute.default,backward,27,1,1,1,4,3060,3
-3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5
-3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10
-3256,permute_324,call_function,permute.default,backward,27,1,1,1,3143,2,4
-3257,dtype_cast_259,call_function,dtype_cast.default,backward,27,1,1,1,3144,1,4
-3258,alias_default_1494,call_function,alias.default,backward,27,1,1,0,3145,0,3
-3259,convert_element_type_700,call_function,convert_element_type.default,backward,27,1,1,1,3149,3057,8
-3260,convert_element_type_701,call_function,convert_element_type.default,backward,27,1,1,1,3046,3057,4
-3261,convert_element_type_702,call_function,convert_element_type.default,backward,27,1,1,1,3,3051,2
-3262,alias_default_803,call_function,alias.default,backward,27,1,1,2,3150,3056,4
-3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8
-3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8
-3265,alias_default_804,call_function,alias.default,backward,27,1,1,2,3153,3049,4
-3266,alias_default_805,call_function,alias.default,backward,27,1,1,3,3055,3055,4
-3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8
-3268,sum_3,call_function,sum.dim_IntList,backward,27,1,1,1,3158,3047,5
-3269,div_29,call_function,div.Tensor,backward,27,1,1,1,3056,3047,6
-3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8
-3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10
-3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8
-3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8
-3274,sum_4,call_function,sum.dim_IntList,backward,27,1,1,1,3155,3,5
-3275,convert_element_type_703,call_function,convert_element_type.default,backward,27,1,1,1,3163,3043,6
-3276,convert_element_type_704,call_function,convert_element_type.default,backward,27,1,1,1,3156,2,3
-3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10
-3278,dtype_cast_260,call_function,dtype_cast.default,backward,27,1,1,1,3157,1,3
-3279,alias_default_1498,call_function,alias.default,backward,27,1,1,0,3158,0,2
-3280,alias_default_806,call_function,alias.default,unknown,,1,1,3,3165,3041,4
-3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5
-3282,permute_327,call_function,permute.default,backward,27,1,1,1,4,3037,3
-3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5
-3284,permute_328,call_function,permute.default,backward,27,1,1,1,3167,2,4
-3285,dtype_cast_261,call_function,dtype_cast.default,backward,27,1,1,1,3168,1,4
-3286,alias_default_1493,call_function,alias.default,backward,27,1,1,0,3169,0,3
-3287,view_656,call_function,view.default,backward,27,1,1,1,3168,3035,4
-3288,permute_329,call_function,permute.default,backward,27,1,1,1,3169,3034,4
-3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2
-3290,getitem_252,call_function,getitem,backward,27,1,1,1,3174,3006,2
-3291,getitem_253,call_function,getitem,backward,27,1,1,1,3174,3007,2
-3292,getitem_254,call_function,getitem,backward,27,1,1,1,3174,3000,2
-3293,permute_330,call_function,permute.default,backward,27,1,1,1,3175,2999,2
-3294,permute_331,call_function,permute.default,backward,27,1,1,1,3175,3006,2
-3295,permute_332,call_function,permute.default,backward,27,1,1,1,3175,3005,2
-3296,convert_element_type_709,call_function,convert_element_type.default,backward,27,1,1,1,3176,3005,2
-3297,convert_element_type_710,call_function,convert_element_type.default,backward,27,1,1,1,3176,3004,2
-3298,view_657,call_function,view.default,backward,27,1,1,1,3177,3004,2
-3299,view_as_complex_56,call_function,view_as_complex.default,backward,27,1,1,1,3178,3003,6
-3300,_conj,call_function,_conj.default,backward,27,1,1,1,4,3004,3
-3301,clone_6,call_function,clone.default,backward,27,1,1,1,5,3003,3
-3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8
-3303,view_658,call_function,view.default,backward,27,1,1,1,3177,3003,2
-3304,view_as_complex_57,call_function,view_as_complex.default,backward,27,1,1,1,3178,3002,6
-3305,_conj_1,call_function,_conj.default,backward,27,1,1,1,4,3003,3
-3306,clone_7,call_function,clone.default,backward,27,1,1,1,5,3002,3
-3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8
-3308,view_as_real_56,call_function,view_as_real.default,backward,27,1,1,1,3182,3001,6
-3309,view_659,call_function,view.default,backward,27,1,1,1,3183,3000,6
-3310,convert_element_type_711,call_function,convert_element_type.default,backward,27,1,1,1,3184,2999,6
-3311,view_as_real_57,call_function,view_as_real.default,backward,27,1,1,1,3182,3000,6
-3312,view_660,call_function,view.default,backward,27,1,1,1,3183,2999,6
-3313,convert_element_type_712,call_function,convert_element_type.default,backward,27,1,1,1,3184,2998,6
-3314,view_661,call_function,view.default,backward,27,1,1,1,3176,2998,2
-3315,view_662,call_function,view.default,backward,27,1,1,1,3185,2998,5
-3316,view_663,call_function,view.default,backward,27,1,1,1,3185,2997,5
-3317,alias_default_807,call_function,alias.default,backward,27,1,1,2,3177,2997,4
-3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5
-3319,permute_335,call_function,permute.default,backward,27,1,1,1,4,2993,3
-3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5
-3321,permute_336,call_function,permute.default,backward,27,1,1,1,3179,2,4
-3322,dtype_cast_262,call_function,dtype_cast.default,backward,27,1,1,1,3180,1,4
-3323,alias_default_1492,call_function,alias.default,backward,27,1,1,0,3181,0,3
-3324,alias_default_808,call_function,alias.default,backward,27,1,1,2,3186,2997,4
-3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5
-3326,permute_339,call_function,permute.default,backward,27,1,1,1,4,2993,3
-3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5
-3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10
-3329,permute_340,call_function,permute.default,backward,27,1,1,1,3188,2,4
-3330,dtype_cast_263,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4
-3331,alias_default_1491,call_function,alias.default,backward,27,1,1,0,3190,0,3
-3332,alias_default_809,call_function,alias.default,backward,27,1,1,2,3186,2996,4
-3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5
-3334,permute_343,call_function,permute.default,backward,27,1,1,1,4,2992,3
-3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5
-3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10
-3337,permute_344,call_function,permute.default,backward,27,1,1,1,3188,2,4
-3338,dtype_cast_264,call_function,dtype_cast.default,backward,27,1,1,1,3189,1,4
-3339,alias_default_1490,call_function,alias.default,backward,27,1,1,0,3190,0,3
-3340,convert_element_type_725,call_function,convert_element_type.default,backward,27,1,1,1,3212,2989,8
-3341,convert_element_type_726,call_function,convert_element_type.default,backward,27,1,1,1,2979,2989,4
-3342,convert_element_type_727,call_function,convert_element_type.default,backward,27,1,1,1,3,2983,2
-3343,alias_default_810,call_function,alias.default,backward,27,1,1,2,3213,2988,4
-3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8
-3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8
-3346,alias_default_811,call_function,alias.default,backward,27,1,1,2,3216,2981,4
-3347,alias_default_812,call_function,alias.default,backward,27,1,1,3,2988,2987,4
-3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8
-3349,sum_5,call_function,sum.dim_IntList,backward,27,1,1,1,3221,2979,5
-3350,div_30,call_function,div.Tensor,backward,27,1,1,1,2989,2979,6
-3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8
-3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10
-3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8
-3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8
-3355,sum_6,call_function,sum.dim_IntList,backward,27,1,1,1,3218,3,5
-3356,convert_element_type_728,call_function,convert_element_type.default,backward,27,1,1,1,3226,2975,6
-3357,convert_element_type_729,call_function,convert_element_type.default,backward,27,1,1,1,3219,2,3
-3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10
-3359,dtype_cast_265,call_function,dtype_cast.default,backward,27,1,1,1,3220,1,3
-3360,alias_default_1497,call_function,alias.default,backward,27,1,1,0,3221,0,2
-3361,alias_default_813,call_function,alias.default,unknown,,1,1,3,3228,2973,4
-3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5
-3363,permute_347,call_function,permute.default,backward,26,1,1,1,4,2969,3
-3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5
-3365,permute_348,call_function,permute.default,backward,26,1,1,1,3230,2,4
-3366,dtype_cast_266,call_function,dtype_cast.default,backward,26,1,1,1,3231,1,4
-3367,alias_default_1486,call_function,alias.default,backward,26,1,1,0,3232,0,3
-3368,alias_default_814,call_function,alias.default,backward,26,1,1,2,3231,2967,4
-3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8
-3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8
-3371,alias_default_815,call_function,alias.default,backward,26,1,1,2,3233,2954,4
-3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5
-3373,permute_351,call_function,permute.default,backward,26,1,1,1,4,2950,3
-3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5
-3375,permute_352,call_function,permute.default,backward,26,1,1,1,3235,2,4
-3376,dtype_cast_267,call_function,dtype_cast.default,backward,26,1,1,1,3236,1,4
-3377,alias_default_1487,call_function,alias.default,backward,26,1,1,0,3237,0,3
-3378,convert_element_type_738,call_function,convert_element_type.default,backward,26,1,1,1,3233,2958,6
-3379,convert_element_type_739,call_function,convert_element_type.default,backward,26,1,1,1,2956,2968,4
-3380,alias_default_816,call_function,alias.default,backward,26,1,1,2,2957,2967,4
-3381,neg_29,call_function,neg.default,backward,26,1,1,1,2958,2966,8
-3382,exp_29,call_function,exp.default,backward,26,1,1,1,2959,2965,6
-3383,add_148,call_function,add.Tensor,backward,26,1,1,1,2960,2964,4
-3384,reciprocal_1,call_function,reciprocal.default,backward,26,1,1,1,2961,2963,4
-3385,mul_226,call_function,mul.Tensor,backward,26,1,1,1,2962,2962,6
-3386,alias_default_817,call_function,alias.default,backward,26,1,1,2,2963,2961,4
-3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8
-3388,sub_4,call_function,sub.Tensor,backward,26,1,1,1,2964,2959,4
-3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8
-3390,add_149,call_function,add.Tensor,backward,26,1,1,1,2966,2957,4
-3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8
-3392,convert_element_type_740,call_function,convert_element_type.default,backward,26,1,1,1,3247,2955,6
-3393,alias_default_818,call_function,alias.default,backward,26,1,1,2,3248,2954,4
-3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5
-3395,permute_355,call_function,permute.default,backward,26,1,1,1,4,2950,3
-3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5
-3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10
-3398,permute_356,call_function,permute.default,backward,26,1,1,1,3250,2,4
-3399,dtype_cast_268,call_function,dtype_cast.default,backward,26,1,1,1,3251,1,4
-3400,alias_default_1485,call_function,alias.default,backward,26,1,1,0,3252,0,3
-3401,convert_element_type_745,call_function,convert_element_type.default,backward,26,1,1,1,3256,2947,8
-3402,convert_element_type_746,call_function,convert_element_type.default,backward,26,1,1,1,2936,2947,4
-3403,convert_element_type_747,call_function,convert_element_type.default,backward,26,1,1,1,3,2941,2
-3404,alias_default_819,call_function,alias.default,backward,26,1,1,2,3257,2946,4
-3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8
-3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8
-3407,alias_default_820,call_function,alias.default,backward,26,1,1,2,3260,2939,4
-3408,alias_default_821,call_function,alias.default,backward,26,1,1,3,2945,2945,4
-3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8
-3410,sum_7,call_function,sum.dim_IntList,backward,26,1,1,1,3265,2937,5
-3411,div_31,call_function,div.Tensor,backward,26,1,1,1,2946,2937,6
-3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8
-3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10
-3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8
-3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8
-3416,sum_8,call_function,sum.dim_IntList,backward,26,1,1,1,3262,3,5
-3417,convert_element_type_748,call_function,convert_element_type.default,backward,26,1,1,1,3270,2933,6
-3418,convert_element_type_749,call_function,convert_element_type.default,backward,26,1,1,1,3263,2,3
-3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10
-3420,dtype_cast_269,call_function,dtype_cast.default,backward,26,1,1,1,3264,1,3
-3421,alias_default_1489,call_function,alias.default,backward,26,1,1,0,3265,0,2
-3422,alias_default_822,call_function,alias.default,unknown,,1,1,3,3272,2931,4
-3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5
-3424,permute_359,call_function,permute.default,backward,26,1,1,1,4,2927,3
-3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5
-3426,permute_360,call_function,permute.default,backward,26,1,1,1,3274,2,4
-3427,dtype_cast_270,call_function,dtype_cast.default,backward,26,1,1,1,3275,1,4
-3428,alias_default_1484,call_function,alias.default,backward,26,1,1,0,3276,0,3
-3429,view_678,call_function,view.default,backward,26,1,1,1,3275,2925,4
-3430,permute_361,call_function,permute.default,backward,26,1,1,1,3276,2924,4
-3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2
-3432,getitem_255,call_function,getitem,backward,26,1,1,1,3281,2896,2
-3433,getitem_256,call_function,getitem,backward,26,1,1,1,3281,2897,2
-3434,getitem_257,call_function,getitem,backward,26,1,1,1,3281,2890,2
-3435,permute_362,call_function,permute.default,backward,26,1,1,1,3282,2889,2
-3436,permute_363,call_function,permute.default,backward,26,1,1,1,3282,2896,2
-3437,permute_364,call_function,permute.default,backward,26,1,1,1,3282,2895,2
-3438,convert_element_type_754,call_function,convert_element_type.default,backward,26,1,1,1,3283,2895,2
-3439,convert_element_type_755,call_function,convert_element_type.default,backward,26,1,1,1,3283,2894,2
-3440,view_679,call_function,view.default,backward,26,1,1,1,3284,2894,2
-3441,view_as_complex_58,call_function,view_as_complex.default,backward,26,1,1,1,3285,2893,6
-3442,_conj_2,call_function,_conj.default,backward,26,1,1,1,4,2894,3
-3443,clone_14,call_function,clone.default,backward,26,1,1,1,5,2893,3
-3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8
-3445,view_680,call_function,view.default,backward,26,1,1,1,3284,2893,2
-3446,view_as_complex_59,call_function,view_as_complex.default,backward,26,1,1,1,3285,2892,6
-3447,_conj_3,call_function,_conj.default,backward,26,1,1,1,4,2893,3
-3448,clone_15,call_function,clone.default,backward,26,1,1,1,5,2892,3
-3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8
-3450,view_as_real_58,call_function,view_as_real.default,backward,26,1,1,1,3289,2891,6
-3451,view_681,call_function,view.default,backward,26,1,1,1,3290,2890,6
-3452,convert_element_type_756,call_function,convert_element_type.default,backward,26,1,1,1,3291,2889,6
-3453,view_as_real_59,call_function,view_as_real.default,backward,26,1,1,1,3289,2890,6
-3454,view_682,call_function,view.default,backward,26,1,1,1,3290,2889,6
-3455,convert_element_type_757,call_function,convert_element_type.default,backward,26,1,1,1,3291,2888,6
-3456,view_683,call_function,view.default,backward,26,1,1,1,3283,2888,2
-3457,view_684,call_function,view.default,backward,26,1,1,1,3292,2888,5
-3458,view_685,call_function,view.default,backward,26,1,1,1,3292,2887,5
-3459,alias_default_823,call_function,alias.default,backward,26,1,1,2,3284,2887,4
-3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5
-3461,permute_367,call_function,permute.default,backward,26,1,1,1,4,2883,3
-3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5
-3463,permute_368,call_function,permute.default,backward,26,1,1,1,3286,2,4
-3464,dtype_cast_271,call_function,dtype_cast.default,backward,26,1,1,1,3287,1,4
-3465,alias_default_1483,call_function,alias.default,backward,26,1,1,0,3288,0,3
-3466,alias_default_824,call_function,alias.default,backward,26,1,1,2,3293,2887,4
-3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5
-3468,permute_371,call_function,permute.default,backward,26,1,1,1,4,2883,3
-3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5
-3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10
-3471,permute_372,call_function,permute.default,backward,26,1,1,1,3295,2,4
-3472,dtype_cast_272,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4
-3473,alias_default_1482,call_function,alias.default,backward,26,1,1,0,3297,0,3
-3474,alias_default_825,call_function,alias.default,backward,26,1,1,2,3293,2886,4
-3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5
-3476,permute_375,call_function,permute.default,backward,26,1,1,1,4,2882,3
-3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5
-3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10
-3479,permute_376,call_function,permute.default,backward,26,1,1,1,3295,2,4
-3480,dtype_cast_273,call_function,dtype_cast.default,backward,26,1,1,1,3296,1,4
-3481,alias_default_1481,call_function,alias.default,backward,26,1,1,0,3297,0,3
-3482,convert_element_type_770,call_function,convert_element_type.default,backward,26,1,1,1,3319,2879,8
-3483,convert_element_type_771,call_function,convert_element_type.default,backward,26,1,1,1,2869,2879,4
-3484,convert_element_type_772,call_function,convert_element_type.default,backward,26,1,1,1,3,2873,2
-3485,alias_default_826,call_function,alias.default,backward,26,1,1,2,3320,2878,4
-3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8
-3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8
-3488,alias_default_827,call_function,alias.default,backward,26,1,1,2,3323,2871,4
-3489,alias_default_828,call_function,alias.default,backward,26,1,1,3,2878,2877,4
-3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8
-3491,sum_9,call_function,sum.dim_IntList,backward,26,1,1,1,3328,2869,5
-3492,div_32,call_function,div.Tensor,backward,26,1,1,1,2879,2869,6
-3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8
-3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10
-3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8
-3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8
-3497,sum_10,call_function,sum.dim_IntList,backward,26,1,1,1,3325,3,5
-3498,convert_element_type_773,call_function,convert_element_type.default,backward,26,1,1,1,3333,2865,6
-3499,convert_element_type_774,call_function,convert_element_type.default,backward,26,1,1,1,3326,2,3
-3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10
-3501,dtype_cast_274,call_function,dtype_cast.default,backward,26,1,1,1,3327,1,3
-3502,alias_default_1488,call_function,alias.default,backward,26,1,1,0,3328,0,2
-3503,alias_default_829,call_function,alias.default,unknown,,1,1,3,3335,2863,4
-3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5
-3505,permute_379,call_function,permute.default,backward,25,1,1,1,4,2859,3
-3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5
-3507,permute_380,call_function,permute.default,backward,25,1,1,1,3337,2,4
-3508,dtype_cast_275,call_function,dtype_cast.default,backward,25,1,1,1,3338,1,4
-3509,alias_default_1477,call_function,alias.default,backward,25,1,1,0,3339,0,3
-3510,alias_default_830,call_function,alias.default,backward,25,1,1,2,3338,2857,4
-3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8
-3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8
-3513,alias_default_831,call_function,alias.default,backward,25,1,1,2,3340,2844,4
-3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5
-3515,permute_383,call_function,permute.default,backward,25,1,1,1,4,2840,3
-3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5
-3517,permute_384,call_function,permute.default,backward,25,1,1,1,3342,2,4
-3518,dtype_cast_276,call_function,dtype_cast.default,backward,25,1,1,1,3343,1,4
-3519,alias_default_1478,call_function,alias.default,backward,25,1,1,0,3344,0,3
-3520,convert_element_type_783,call_function,convert_element_type.default,backward,25,1,1,1,3340,2848,6
-3521,convert_element_type_784,call_function,convert_element_type.default,backward,25,1,1,1,2846,2858,4
-3522,alias_default_832,call_function,alias.default,backward,25,1,1,2,2847,2857,4
-3523,neg_30,call_function,neg.default,backward,25,1,1,1,2848,2856,8
-3524,exp_30,call_function,exp.default,backward,25,1,1,1,2849,2855,6
-3525,add_155,call_function,add.Tensor,backward,25,1,1,1,2850,2854,4
-3526,reciprocal_2,call_function,reciprocal.default,backward,25,1,1,1,2851,2853,4
-3527,mul_246,call_function,mul.Tensor,backward,25,1,1,1,2852,2852,6
-3528,alias_default_833,call_function,alias.default,backward,25,1,1,2,2853,2851,4
-3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8
-3530,sub_7,call_function,sub.Tensor,backward,25,1,1,1,2854,2849,4
-3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8
-3532,add_156,call_function,add.Tensor,backward,25,1,1,1,2856,2847,4
-3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8
-3534,convert_element_type_785,call_function,convert_element_type.default,backward,25,1,1,1,3354,2845,6
-3535,alias_default_834,call_function,alias.default,backward,25,1,1,2,3355,2844,4
-3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5
-3537,permute_387,call_function,permute.default,backward,25,1,1,1,4,2840,3
-3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5
-3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10
-3540,permute_388,call_function,permute.default,backward,25,1,1,1,3357,2,4
-3541,dtype_cast_277,call_function,dtype_cast.default,backward,25,1,1,1,3358,1,4
-3542,alias_default_1476,call_function,alias.default,backward,25,1,1,0,3359,0,3
-3543,convert_element_type_790,call_function,convert_element_type.default,backward,25,1,1,1,3363,2837,8
-3544,convert_element_type_791,call_function,convert_element_type.default,backward,25,1,1,1,2826,2837,4
-3545,convert_element_type_792,call_function,convert_element_type.default,backward,25,1,1,1,3,2831,2
-3546,alias_default_835,call_function,alias.default,backward,25,1,1,2,3364,2836,4
-3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8
-3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8
-3549,alias_default_836,call_function,alias.default,backward,25,1,1,2,3367,2829,4
-3550,alias_default_837,call_function,alias.default,backward,25,1,1,3,2835,2835,4
-3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8
-3552,sum_11,call_function,sum.dim_IntList,backward,25,1,1,1,3372,2827,5
-3553,div_33,call_function,div.Tensor,backward,25,1,1,1,2836,2827,6
-3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8
-3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10
-3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8
-3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8
-3558,sum_12,call_function,sum.dim_IntList,backward,25,1,1,1,3369,3,5
-3559,convert_element_type_793,call_function,convert_element_type.default,backward,25,1,1,1,3377,2823,6
-3560,convert_element_type_794,call_function,convert_element_type.default,backward,25,1,1,1,3370,2,3
-3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10
-3562,dtype_cast_278,call_function,dtype_cast.default,backward,25,1,1,1,3371,1,3
-3563,alias_default_1480,call_function,alias.default,backward,25,1,1,0,3372,0,2
-3564,alias_default_838,call_function,alias.default,unknown,,1,1,3,3379,2821,4
-3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5
-3566,permute_391,call_function,permute.default,backward,25,1,1,1,4,2817,3
-3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5
-3568,permute_392,call_function,permute.default,backward,25,1,1,1,3381,2,4
-3569,dtype_cast_279,call_function,dtype_cast.default,backward,25,1,1,1,3382,1,4
-3570,alias_default_1475,call_function,alias.default,backward,25,1,1,0,3383,0,3
-3571,view_700,call_function,view.default,backward,25,1,1,1,3382,2815,4
-3572,permute_393,call_function,permute.default,backward,25,1,1,1,3383,2814,4
-3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2
-3574,getitem_258,call_function,getitem,backward,25,1,1,1,3388,2786,2
-3575,getitem_259,call_function,getitem,backward,25,1,1,1,3388,2787,2
-3576,getitem_260,call_function,getitem,backward,25,1,1,1,3388,2780,2
-3577,permute_394,call_function,permute.default,backward,25,1,1,1,3389,2779,2
-3578,permute_395,call_function,permute.default,backward,25,1,1,1,3389,2786,2
-3579,permute_396,call_function,permute.default,backward,25,1,1,1,3389,2785,2
-3580,convert_element_type_799,call_function,convert_element_type.default,backward,25,1,1,1,3390,2785,2
-3581,convert_element_type_800,call_function,convert_element_type.default,backward,25,1,1,1,3390,2784,2
-3582,view_701,call_function,view.default,backward,25,1,1,1,3391,2784,2
-3583,view_as_complex_60,call_function,view_as_complex.default,backward,25,1,1,1,3392,2783,6
-3584,_conj_4,call_function,_conj.default,backward,25,1,1,1,4,2784,3
-3585,clone_22,call_function,clone.default,backward,25,1,1,1,5,2783,3
-3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8
-3587,view_702,call_function,view.default,backward,25,1,1,1,3391,2783,2
-3588,view_as_complex_61,call_function,view_as_complex.default,backward,25,1,1,1,3392,2782,6
-3589,_conj_5,call_function,_conj.default,backward,25,1,1,1,4,2783,3
-3590,clone_23,call_function,clone.default,backward,25,1,1,1,5,2782,3
-3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8
-3592,view_as_real_60,call_function,view_as_real.default,backward,25,1,1,1,3396,2781,6
-3593,view_703,call_function,view.default,backward,25,1,1,1,3397,2780,6
-3594,convert_element_type_801,call_function,convert_element_type.default,backward,25,1,1,1,3398,2779,6
-3595,view_as_real_61,call_function,view_as_real.default,backward,25,1,1,1,3396,2780,6
-3596,view_704,call_function,view.default,backward,25,1,1,1,3397,2779,6
-3597,convert_element_type_802,call_function,convert_element_type.default,backward,25,1,1,1,3398,2778,6
-3598,view_705,call_function,view.default,backward,25,1,1,1,3390,2778,2
-3599,view_706,call_function,view.default,backward,25,1,1,1,3399,2778,5
-3600,view_707,call_function,view.default,backward,25,1,1,1,3399,2777,5
-3601,alias_default_839,call_function,alias.default,backward,25,1,1,2,3391,2777,4
-3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5
-3603,permute_399,call_function,permute.default,backward,25,1,1,1,4,2773,3
-3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5
-3605,permute_400,call_function,permute.default,backward,25,1,1,1,3393,2,4
-3606,dtype_cast_280,call_function,dtype_cast.default,backward,25,1,1,1,3394,1,4
-3607,alias_default_1474,call_function,alias.default,backward,25,1,1,0,3395,0,3
-3608,alias_default_840,call_function,alias.default,backward,25,1,1,2,3400,2777,4
-3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5
-3610,permute_403,call_function,permute.default,backward,25,1,1,1,4,2773,3
-3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5
-3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10
-3613,permute_404,call_function,permute.default,backward,25,1,1,1,3402,2,4
-3614,dtype_cast_281,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4
-3615,alias_default_1473,call_function,alias.default,backward,25,1,1,0,3404,0,3
-3616,alias_default_841,call_function,alias.default,backward,25,1,1,2,3400,2776,4
-3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5
-3618,permute_407,call_function,permute.default,backward,25,1,1,1,4,2772,3
-3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5
-3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10
-3621,permute_408,call_function,permute.default,backward,25,1,1,1,3402,2,4
-3622,dtype_cast_282,call_function,dtype_cast.default,backward,25,1,1,1,3403,1,4
-3623,alias_default_1472,call_function,alias.default,backward,25,1,1,0,3404,0,3
-3624,convert_element_type_815,call_function,convert_element_type.default,backward,25,1,1,1,3426,2769,8
-3625,convert_element_type_816,call_function,convert_element_type.default,backward,25,1,1,1,2759,2769,4
-3626,convert_element_type_817,call_function,convert_element_type.default,backward,25,1,1,1,3,2763,2
-3627,alias_default_842,call_function,alias.default,backward,25,1,1,2,3427,2768,4
-3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8
-3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8
-3630,alias_default_843,call_function,alias.default,backward,25,1,1,2,3430,2761,4
-3631,alias_default_844,call_function,alias.default,backward,25,1,1,3,2768,2767,4
-3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8
-3633,sum_13,call_function,sum.dim_IntList,backward,25,1,1,1,3435,2759,5
-3634,div_34,call_function,div.Tensor,backward,25,1,1,1,2769,2759,6
-3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8
-3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10
-3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8
-3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8
-3639,sum_14,call_function,sum.dim_IntList,backward,25,1,1,1,3432,3,5
-3640,convert_element_type_818,call_function,convert_element_type.default,backward,25,1,1,1,3440,2755,6
-3641,convert_element_type_819,call_function,convert_element_type.default,backward,25,1,1,1,3433,2,3
-3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10
-3643,dtype_cast_283,call_function,dtype_cast.default,backward,25,1,1,1,3434,1,3
-3644,alias_default_1479,call_function,alias.default,backward,25,1,1,0,3435,0,2
-3645,alias_default_845,call_function,alias.default,unknown,,1,1,3,3442,2753,4
-3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5
-3647,permute_411,call_function,permute.default,backward,24,1,1,1,4,2749,3
-3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5
-3649,permute_412,call_function,permute.default,backward,24,1,1,1,3444,2,4
-3650,dtype_cast_284,call_function,dtype_cast.default,backward,24,1,1,1,3445,1,4
-3651,alias_default_1468,call_function,alias.default,backward,24,1,1,0,3446,0,3
-3652,alias_default_846,call_function,alias.default,backward,24,1,1,2,3445,2747,4
-3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8
-3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8
-3655,alias_default_847,call_function,alias.default,backward,24,1,1,2,3447,2734,4
-3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5
-3657,permute_415,call_function,permute.default,backward,24,1,1,1,4,2730,3
-3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5
-3659,permute_416,call_function,permute.default,backward,24,1,1,1,3449,2,4
-3660,dtype_cast_285,call_function,dtype_cast.default,backward,24,1,1,1,3450,1,4
-3661,alias_default_1469,call_function,alias.default,backward,24,1,1,0,3451,0,3
-3662,convert_element_type_828,call_function,convert_element_type.default,backward,24,1,1,1,3447,2738,6
-3663,convert_element_type_829,call_function,convert_element_type.default,backward,24,1,1,1,2736,2748,4
-3664,alias_default_848,call_function,alias.default,backward,24,1,1,2,2737,2747,4
-3665,neg_31,call_function,neg.default,backward,24,1,1,1,2738,2746,8
-3666,exp_31,call_function,exp.default,backward,24,1,1,1,2739,2745,6
-3667,add_162,call_function,add.Tensor,backward,24,1,1,1,2740,2744,4
-3668,reciprocal_3,call_function,reciprocal.default,backward,24,1,1,1,2741,2743,4
-3669,mul_266,call_function,mul.Tensor,backward,24,1,1,1,2742,2742,6
-3670,alias_default_849,call_function,alias.default,backward,24,1,1,2,2743,2741,4
-3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8
-3672,sub_10,call_function,sub.Tensor,backward,24,1,1,1,2744,2739,4
-3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8
-3674,add_163,call_function,add.Tensor,backward,24,1,1,1,2746,2737,4
-3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8
-3676,convert_element_type_830,call_function,convert_element_type.default,backward,24,1,1,1,3461,2735,6
-3677,alias_default_850,call_function,alias.default,backward,24,1,1,2,3462,2734,4
-3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5
-3679,permute_419,call_function,permute.default,backward,24,1,1,1,4,2730,3
-3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5
-3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10
-3682,permute_420,call_function,permute.default,backward,24,1,1,1,3464,2,4
-3683,dtype_cast_286,call_function,dtype_cast.default,backward,24,1,1,1,3465,1,4
-3684,alias_default_1467,call_function,alias.default,backward,24,1,1,0,3466,0,3
-3685,convert_element_type_835,call_function,convert_element_type.default,backward,24,1,1,1,3470,2727,8
-3686,convert_element_type_836,call_function,convert_element_type.default,backward,24,1,1,1,2716,2727,4
-3687,convert_element_type_837,call_function,convert_element_type.default,backward,24,1,1,1,3,2721,2
-3688,alias_default_851,call_function,alias.default,backward,24,1,1,2,3471,2726,4
-3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8
-3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8
-3691,alias_default_852,call_function,alias.default,backward,24,1,1,2,3474,2719,4
-3692,alias_default_853,call_function,alias.default,backward,24,1,1,3,2725,2725,4
-3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8
-3694,sum_15,call_function,sum.dim_IntList,backward,24,1,1,1,3479,2717,5
-3695,div_35,call_function,div.Tensor,backward,24,1,1,1,2726,2717,6
-3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8
-3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10
-3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8
-3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8
-3700,sum_16,call_function,sum.dim_IntList,backward,24,1,1,1,3476,3,5
-3701,convert_element_type_838,call_function,convert_element_type.default,backward,24,1,1,1,3484,2713,6
-3702,convert_element_type_839,call_function,convert_element_type.default,backward,24,1,1,1,3477,2,3
-3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10
-3704,dtype_cast_287,call_function,dtype_cast.default,backward,24,1,1,1,3478,1,3
-3705,alias_default_1471,call_function,alias.default,backward,24,1,1,0,3479,0,2
-3706,alias_default_854,call_function,alias.default,unknown,,1,1,3,3486,2711,4
-3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5
-3708,permute_423,call_function,permute.default,backward,24,1,1,1,4,2707,3
-3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5
-3710,permute_424,call_function,permute.default,backward,24,1,1,1,3488,2,4
-3711,dtype_cast_288,call_function,dtype_cast.default,backward,24,1,1,1,3489,1,4
-3712,alias_default_1466,call_function,alias.default,backward,24,1,1,0,3490,0,3
-3713,view_722,call_function,view.default,backward,24,1,1,1,3489,2705,4
-3714,permute_425,call_function,permute.default,backward,24,1,1,1,3490,2704,4
-3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2
-3716,getitem_261,call_function,getitem,backward,24,1,1,1,3495,2676,2
-3717,getitem_262,call_function,getitem,backward,24,1,1,1,3495,2677,2
-3718,getitem_263,call_function,getitem,backward,24,1,1,1,3495,2670,2
-3719,permute_426,call_function,permute.default,backward,24,1,1,1,3496,2669,2
-3720,permute_427,call_function,permute.default,backward,24,1,1,1,3496,2676,2
-3721,permute_428,call_function,permute.default,backward,24,1,1,1,3496,2675,2
-3722,convert_element_type_844,call_function,convert_element_type.default,backward,24,1,1,1,3497,2675,2
-3723,convert_element_type_845,call_function,convert_element_type.default,backward,24,1,1,1,3497,2674,2
-3724,view_723,call_function,view.default,backward,24,1,1,1,3498,2674,2
-3725,view_as_complex_62,call_function,view_as_complex.default,backward,24,1,1,1,3499,2673,6
-3726,_conj_6,call_function,_conj.default,backward,24,1,1,1,4,2674,3
-3727,clone_30,call_function,clone.default,backward,24,1,1,1,5,2673,3
-3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8
-3729,view_724,call_function,view.default,backward,24,1,1,1,3498,2673,2
-3730,view_as_complex_63,call_function,view_as_complex.default,backward,24,1,1,1,3499,2672,6
-3731,_conj_7,call_function,_conj.default,backward,24,1,1,1,4,2673,3
-3732,clone_31,call_function,clone.default,backward,24,1,1,1,5,2672,3
-3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8
-3734,view_as_real_62,call_function,view_as_real.default,backward,24,1,1,1,3503,2671,6
-3735,view_725,call_function,view.default,backward,24,1,1,1,3504,2670,6
-3736,convert_element_type_846,call_function,convert_element_type.default,backward,24,1,1,1,3505,2669,6
-3737,view_as_real_63,call_function,view_as_real.default,backward,24,1,1,1,3503,2670,6
-3738,view_726,call_function,view.default,backward,24,1,1,1,3504,2669,6
-3739,convert_element_type_847,call_function,convert_element_type.default,backward,24,1,1,1,3505,2668,6
-3740,view_727,call_function,view.default,backward,24,1,1,1,3497,2668,2
-3741,view_728,call_function,view.default,backward,24,1,1,1,3506,2668,5
-3742,view_729,call_function,view.default,backward,24,1,1,1,3506,2667,5
-3743,alias_default_855,call_function,alias.default,backward,24,1,1,2,3498,2667,4
-3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5
-3745,permute_431,call_function,permute.default,backward,24,1,1,1,4,2663,3
-3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5
-3747,permute_432,call_function,permute.default,backward,24,1,1,1,3500,2,4
-3748,dtype_cast_289,call_function,dtype_cast.default,backward,24,1,1,1,3501,1,4
-3749,alias_default_1465,call_function,alias.default,backward,24,1,1,0,3502,0,3
-3750,alias_default_856,call_function,alias.default,backward,24,1,1,2,3507,2667,4
-3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5
-3752,permute_435,call_function,permute.default,backward,24,1,1,1,4,2663,3
-3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5
-3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10
-3755,permute_436,call_function,permute.default,backward,24,1,1,1,3509,2,4
-3756,dtype_cast_290,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4
-3757,alias_default_1464,call_function,alias.default,backward,24,1,1,0,3511,0,3
-3758,alias_default_857,call_function,alias.default,backward,24,1,1,2,3507,2666,4
-3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5
-3760,permute_439,call_function,permute.default,backward,24,1,1,1,4,2662,3
-3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5
-3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10
-3763,permute_440,call_function,permute.default,backward,24,1,1,1,3509,2,4
-3764,dtype_cast_291,call_function,dtype_cast.default,backward,24,1,1,1,3510,1,4
-3765,alias_default_1463,call_function,alias.default,backward,24,1,1,0,3511,0,3
-3766,convert_element_type_860,call_function,convert_element_type.default,backward,24,1,1,1,3533,2659,8
-3767,convert_element_type_861,call_function,convert_element_type.default,backward,24,1,1,1,2649,2659,4
-3768,convert_element_type_862,call_function,convert_element_type.default,backward,24,1,1,1,3,2653,2
-3769,alias_default_858,call_function,alias.default,backward,24,1,1,2,3534,2658,4
-3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8
-3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8
-3772,alias_default_859,call_function,alias.default,backward,24,1,1,2,3537,2651,4
-3773,alias_default_860,call_function,alias.default,backward,24,1,1,3,2658,2657,4
-3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8
-3775,sum_17,call_function,sum.dim_IntList,backward,24,1,1,1,3542,2649,5
-3776,div_36,call_function,div.Tensor,backward,24,1,1,1,2659,2649,6
-3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8
-3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10
-3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8
-3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8
-3781,sum_18,call_function,sum.dim_IntList,backward,24,1,1,1,3539,3,5
-3782,convert_element_type_863,call_function,convert_element_type.default,backward,24,1,1,1,3547,2645,6
-3783,convert_element_type_864,call_function,convert_element_type.default,backward,24,1,1,1,3540,2,3
-3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10
-3785,dtype_cast_292,call_function,dtype_cast.default,backward,24,1,1,1,3541,1,3
-3786,alias_default_1470,call_function,alias.default,backward,24,1,1,0,3542,0,2
-3787,alias_default_861,call_function,alias.default,unknown,,1,1,3,3549,2643,4
-3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5
-3789,permute_443,call_function,permute.default,backward,23,1,1,1,4,2639,3
-3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5
-3791,permute_444,call_function,permute.default,backward,23,1,1,1,3551,2,4
-3792,dtype_cast_293,call_function,dtype_cast.default,backward,23,1,1,1,3552,1,4
-3793,alias_default_1459,call_function,alias.default,backward,23,1,1,0,3553,0,3
-3794,alias_default_862,call_function,alias.default,backward,23,1,1,2,3552,2637,4
-3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8
-3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8
-3797,alias_default_863,call_function,alias.default,backward,23,1,1,2,3554,2624,4
-3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5
-3799,permute_447,call_function,permute.default,backward,23,1,1,1,4,2620,3
-3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5
-3801,permute_448,call_function,permute.default,backward,23,1,1,1,3556,2,4
-3802,dtype_cast_294,call_function,dtype_cast.default,backward,23,1,1,1,3557,1,4
-3803,alias_default_1460,call_function,alias.default,backward,23,1,1,0,3558,0,3
-3804,convert_element_type_873,call_function,convert_element_type.default,backward,23,1,1,1,3554,2628,6
-3805,convert_element_type_874,call_function,convert_element_type.default,backward,23,1,1,1,2626,2638,4
-3806,alias_default_864,call_function,alias.default,backward,23,1,1,2,2627,2637,4
-3807,neg_32,call_function,neg.default,backward,23,1,1,1,2628,2636,8
-3808,exp_32,call_function,exp.default,backward,23,1,1,1,2629,2635,6
-3809,add_169,call_function,add.Tensor,backward,23,1,1,1,2630,2634,4
-3810,reciprocal_4,call_function,reciprocal.default,backward,23,1,1,1,2631,2633,4
-3811,mul_286,call_function,mul.Tensor,backward,23,1,1,1,2632,2632,6
-3812,alias_default_865,call_function,alias.default,backward,23,1,1,2,2633,2631,4
-3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8
-3814,sub_13,call_function,sub.Tensor,backward,23,1,1,1,2634,2629,4
-3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8
-3816,add_170,call_function,add.Tensor,backward,23,1,1,1,2636,2627,4
-3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8
-3818,convert_element_type_875,call_function,convert_element_type.default,backward,23,1,1,1,3568,2625,6
-3819,alias_default_866,call_function,alias.default,backward,23,1,1,2,3569,2624,4
-3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5
-3821,permute_451,call_function,permute.default,backward,23,1,1,1,4,2620,3
-3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5
-3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10
-3824,permute_452,call_function,permute.default,backward,23,1,1,1,3571,2,4
-3825,dtype_cast_295,call_function,dtype_cast.default,backward,23,1,1,1,3572,1,4
-3826,alias_default_1458,call_function,alias.default,backward,23,1,1,0,3573,0,3
-3827,convert_element_type_880,call_function,convert_element_type.default,backward,23,1,1,1,3577,2617,8
-3828,convert_element_type_881,call_function,convert_element_type.default,backward,23,1,1,1,2606,2617,4
-3829,convert_element_type_882,call_function,convert_element_type.default,backward,23,1,1,1,3,2611,2
-3830,alias_default_867,call_function,alias.default,backward,23,1,1,2,3578,2616,4
-3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8
-3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8
-3833,alias_default_868,call_function,alias.default,backward,23,1,1,2,3581,2609,4
-3834,alias_default_869,call_function,alias.default,backward,23,1,1,3,2615,2615,4
-3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8
-3836,sum_19,call_function,sum.dim_IntList,backward,23,1,1,1,3586,2607,5
-3837,div_37,call_function,div.Tensor,backward,23,1,1,1,2616,2607,6
-3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8
-3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10
-3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8
-3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8
-3842,sum_20,call_function,sum.dim_IntList,backward,23,1,1,1,3583,3,5
-3843,convert_element_type_883,call_function,convert_element_type.default,backward,23,1,1,1,3591,2603,6
-3844,convert_element_type_884,call_function,convert_element_type.default,backward,23,1,1,1,3584,2,3
-3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10
-3846,dtype_cast_296,call_function,dtype_cast.default,backward,23,1,1,1,3585,1,3
-3847,alias_default_1462,call_function,alias.default,backward,23,1,1,0,3586,0,2
-3848,alias_default_870,call_function,alias.default,unknown,,1,1,3,3593,2601,4
-3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5
-3850,permute_455,call_function,permute.default,backward,23,1,1,1,4,2597,3
-3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5
-3852,permute_456,call_function,permute.default,backward,23,1,1,1,3595,2,4
-3853,dtype_cast_297,call_function,dtype_cast.default,backward,23,1,1,1,3596,1,4
-3854,alias_default_1457,call_function,alias.default,backward,23,1,1,0,3597,0,3
-3855,view_744,call_function,view.default,backward,23,1,1,1,3596,2595,4
-3856,permute_457,call_function,permute.default,backward,23,1,1,1,3597,2594,4
-3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2
-3858,getitem_264,call_function,getitem,backward,23,1,1,1,3602,2566,2
-3859,getitem_265,call_function,getitem,backward,23,1,1,1,3602,2567,2
-3860,getitem_266,call_function,getitem,backward,23,1,1,1,3602,2560,2
-3861,permute_458,call_function,permute.default,backward,23,1,1,1,3603,2559,2
-3862,permute_459,call_function,permute.default,backward,23,1,1,1,3603,2566,2
-3863,permute_460,call_function,permute.default,backward,23,1,1,1,3603,2565,2
-3864,convert_element_type_889,call_function,convert_element_type.default,backward,23,1,1,1,3604,2565,2
-3865,convert_element_type_890,call_function,convert_element_type.default,backward,23,1,1,1,3604,2564,2
-3866,view_745,call_function,view.default,backward,23,1,1,1,3605,2564,2
-3867,view_as_complex_64,call_function,view_as_complex.default,backward,23,1,1,1,3606,2563,6
-3868,_conj_8,call_function,_conj.default,backward,23,1,1,1,4,2564,3
-3869,clone_38,call_function,clone.default,backward,23,1,1,1,5,2563,3
-3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8
-3871,view_746,call_function,view.default,backward,23,1,1,1,3605,2563,2
-3872,view_as_complex_65,call_function,view_as_complex.default,backward,23,1,1,1,3606,2562,6
-3873,_conj_9,call_function,_conj.default,backward,23,1,1,1,4,2563,3
-3874,clone_39,call_function,clone.default,backward,23,1,1,1,5,2562,3
-3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8
-3876,view_as_real_64,call_function,view_as_real.default,backward,23,1,1,1,3610,2561,6
-3877,view_747,call_function,view.default,backward,23,1,1,1,3611,2560,6
-3878,convert_element_type_891,call_function,convert_element_type.default,backward,23,1,1,1,3612,2559,6
-3879,view_as_real_65,call_function,view_as_real.default,backward,23,1,1,1,3610,2560,6
-3880,view_748,call_function,view.default,backward,23,1,1,1,3611,2559,6
-3881,convert_element_type_892,call_function,convert_element_type.default,backward,23,1,1,1,3612,2558,6
-3882,view_749,call_function,view.default,backward,23,1,1,1,3604,2558,2
-3883,view_750,call_function,view.default,backward,23,1,1,1,3613,2558,5
-3884,view_751,call_function,view.default,backward,23,1,1,1,3613,2557,5
-3885,alias_default_871,call_function,alias.default,backward,23,1,1,2,3605,2557,4
-3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5
-3887,permute_463,call_function,permute.default,backward,23,1,1,1,4,2553,3
-3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5
-3889,permute_464,call_function,permute.default,backward,23,1,1,1,3607,2,4
-3890,dtype_cast_298,call_function,dtype_cast.default,backward,23,1,1,1,3608,1,4
-3891,alias_default_1456,call_function,alias.default,backward,23,1,1,0,3609,0,3
-3892,alias_default_872,call_function,alias.default,backward,23,1,1,2,3614,2557,4
-3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5
-3894,permute_467,call_function,permute.default,backward,23,1,1,1,4,2553,3
-3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5
-3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10
-3897,permute_468,call_function,permute.default,backward,23,1,1,1,3616,2,4
-3898,dtype_cast_299,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4
-3899,alias_default_1455,call_function,alias.default,backward,23,1,1,0,3618,0,3
-3900,alias_default_873,call_function,alias.default,backward,23,1,1,2,3614,2556,4
-3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5
-3902,permute_471,call_function,permute.default,backward,23,1,1,1,4,2552,3
-3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5
-3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10
-3905,permute_472,call_function,permute.default,backward,23,1,1,1,3616,2,4
-3906,dtype_cast_300,call_function,dtype_cast.default,backward,23,1,1,1,3617,1,4
-3907,alias_default_1454,call_function,alias.default,backward,23,1,1,0,3618,0,3
-3908,convert_element_type_905,call_function,convert_element_type.default,backward,23,1,1,1,3640,2549,8
-3909,convert_element_type_906,call_function,convert_element_type.default,backward,23,1,1,1,2539,2549,4
-3910,convert_element_type_907,call_function,convert_element_type.default,backward,23,1,1,1,3,2543,2
-3911,alias_default_874,call_function,alias.default,backward,23,1,1,2,3641,2548,4
-3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8
-3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8
-3914,alias_default_875,call_function,alias.default,backward,23,1,1,2,3644,2541,4
-3915,alias_default_876,call_function,alias.default,backward,23,1,1,3,2548,2547,4
-3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8
-3917,sum_21,call_function,sum.dim_IntList,backward,23,1,1,1,3649,2539,5
-3918,div_38,call_function,div.Tensor,backward,23,1,1,1,2549,2539,6
-3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8
-3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10
-3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8
-3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8
-3923,sum_22,call_function,sum.dim_IntList,backward,23,1,1,1,3646,3,5
-3924,convert_element_type_908,call_function,convert_element_type.default,backward,23,1,1,1,3654,2535,6
-3925,convert_element_type_909,call_function,convert_element_type.default,backward,23,1,1,1,3647,2,3
-3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10
-3927,dtype_cast_301,call_function,dtype_cast.default,backward,23,1,1,1,3648,1,3
-3928,alias_default_1461,call_function,alias.default,backward,23,1,1,0,3649,0,2
-3929,alias_default_877,call_function,alias.default,unknown,,1,1,3,3656,2533,4
-3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5
-3931,permute_475,call_function,permute.default,backward,22,1,1,1,4,2529,3
-3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5
-3933,permute_476,call_function,permute.default,backward,22,1,1,1,3658,2,4
-3934,dtype_cast_302,call_function,dtype_cast.default,backward,22,1,1,1,3659,1,4
-3935,alias_default_1450,call_function,alias.default,backward,22,1,1,0,3660,0,3
-3936,alias_default_878,call_function,alias.default,backward,22,1,1,2,3659,2527,4
-3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8
-3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8
-3939,alias_default_879,call_function,alias.default,backward,22,1,1,2,3661,2514,4
-3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5
-3941,permute_479,call_function,permute.default,backward,22,1,1,1,4,2510,3
-3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5
-3943,permute_480,call_function,permute.default,backward,22,1,1,1,3663,2,4
-3944,dtype_cast_303,call_function,dtype_cast.default,backward,22,1,1,1,3664,1,4
-3945,alias_default_1451,call_function,alias.default,backward,22,1,1,0,3665,0,3
-3946,convert_element_type_918,call_function,convert_element_type.default,backward,22,1,1,1,3661,2518,6
-3947,convert_element_type_919,call_function,convert_element_type.default,backward,22,1,1,1,2516,2528,4
-3948,alias_default_880,call_function,alias.default,backward,22,1,1,2,2517,2527,4
-3949,neg_33,call_function,neg.default,backward,22,1,1,1,2518,2526,8
-3950,exp_33,call_function,exp.default,backward,22,1,1,1,2519,2525,6
-3951,add_176,call_function,add.Tensor,backward,22,1,1,1,2520,2524,4
-3952,reciprocal_5,call_function,reciprocal.default,backward,22,1,1,1,2521,2523,4
-3953,mul_306,call_function,mul.Tensor,backward,22,1,1,1,2522,2522,6
-3954,alias_default_881,call_function,alias.default,backward,22,1,1,2,2523,2521,4
-3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8
-3956,sub_16,call_function,sub.Tensor,backward,22,1,1,1,2524,2519,4
-3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8
-3958,add_177,call_function,add.Tensor,backward,22,1,1,1,2526,2517,4
-3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8
-3960,convert_element_type_920,call_function,convert_element_type.default,backward,22,1,1,1,3675,2515,6
-3961,alias_default_882,call_function,alias.default,backward,22,1,1,2,3676,2514,4
-3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5
-3963,permute_483,call_function,permute.default,backward,22,1,1,1,4,2510,3
-3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5
-3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10
-3966,permute_484,call_function,permute.default,backward,22,1,1,1,3678,2,4
-3967,dtype_cast_304,call_function,dtype_cast.default,backward,22,1,1,1,3679,1,4
-3968,alias_default_1449,call_function,alias.default,backward,22,1,1,0,3680,0,3
-3969,convert_element_type_925,call_function,convert_element_type.default,backward,22,1,1,1,3684,2507,8
-3970,convert_element_type_926,call_function,convert_element_type.default,backward,22,1,1,1,2496,2507,4
-3971,convert_element_type_927,call_function,convert_element_type.default,backward,22,1,1,1,3,2501,2
-3972,alias_default_883,call_function,alias.default,backward,22,1,1,2,3685,2506,4
-3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8
-3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8
-3975,alias_default_884,call_function,alias.default,backward,22,1,1,2,3688,2499,4
-3976,alias_default_885,call_function,alias.default,backward,22,1,1,3,2505,2505,4
-3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8
-3978,sum_23,call_function,sum.dim_IntList,backward,22,1,1,1,3693,2497,5
-3979,div_39,call_function,div.Tensor,backward,22,1,1,1,2506,2497,6
-3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8
-3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10
-3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8
-3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8
-3984,sum_24,call_function,sum.dim_IntList,backward,22,1,1,1,3690,3,5
-3985,convert_element_type_928,call_function,convert_element_type.default,backward,22,1,1,1,3698,2493,6
-3986,convert_element_type_929,call_function,convert_element_type.default,backward,22,1,1,1,3691,2,3
-3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10
-3988,dtype_cast_305,call_function,dtype_cast.default,backward,22,1,1,1,3692,1,3
-3989,alias_default_1453,call_function,alias.default,backward,22,1,1,0,3693,0,2
-3990,alias_default_886,call_function,alias.default,unknown,,1,1,3,3700,2491,4
-3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5
-3992,permute_487,call_function,permute.default,backward,22,1,1,1,4,2487,3
-3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5
-3994,permute_488,call_function,permute.default,backward,22,1,1,1,3702,2,4
-3995,dtype_cast_306,call_function,dtype_cast.default,backward,22,1,1,1,3703,1,4
-3996,alias_default_1448,call_function,alias.default,backward,22,1,1,0,3704,0,3
-3997,view_766,call_function,view.default,backward,22,1,1,1,3703,2485,4
-3998,permute_489,call_function,permute.default,backward,22,1,1,1,3704,2484,4
-3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2
-4000,getitem_267,call_function,getitem,backward,22,1,1,1,3709,2456,2
-4001,getitem_268,call_function,getitem,backward,22,1,1,1,3709,2457,2
-4002,getitem_269,call_function,getitem,backward,22,1,1,1,3709,2450,2
-4003,permute_490,call_function,permute.default,backward,22,1,1,1,3710,2449,2
-4004,permute_491,call_function,permute.default,backward,22,1,1,1,3710,2456,2
-4005,permute_492,call_function,permute.default,backward,22,1,1,1,3710,2455,2
-4006,convert_element_type_934,call_function,convert_element_type.default,backward,22,1,1,1,3711,2455,2
-4007,convert_element_type_935,call_function,convert_element_type.default,backward,22,1,1,1,3711,2454,2
-4008,view_767,call_function,view.default,backward,22,1,1,1,3712,2454,2
-4009,view_as_complex_66,call_function,view_as_complex.default,backward,22,1,1,1,3713,2453,6
-4010,_conj_10,call_function,_conj.default,backward,22,1,1,1,4,2454,3
-4011,clone_46,call_function,clone.default,backward,22,1,1,1,5,2453,3
-4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8
-4013,view_768,call_function,view.default,backward,22,1,1,1,3712,2453,2
-4014,view_as_complex_67,call_function,view_as_complex.default,backward,22,1,1,1,3713,2452,6
-4015,_conj_11,call_function,_conj.default,backward,22,1,1,1,4,2453,3
-4016,clone_47,call_function,clone.default,backward,22,1,1,1,5,2452,3
-4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8
-4018,view_as_real_66,call_function,view_as_real.default,backward,22,1,1,1,3717,2451,6
-4019,view_769,call_function,view.default,backward,22,1,1,1,3718,2450,6
-4020,convert_element_type_936,call_function,convert_element_type.default,backward,22,1,1,1,3719,2449,6
-4021,view_as_real_67,call_function,view_as_real.default,backward,22,1,1,1,3717,2450,6
-4022,view_770,call_function,view.default,backward,22,1,1,1,3718,2449,6
-4023,convert_element_type_937,call_function,convert_element_type.default,backward,22,1,1,1,3719,2448,6
-4024,view_771,call_function,view.default,backward,22,1,1,1,3711,2448,2
-4025,view_772,call_function,view.default,backward,22,1,1,1,3720,2448,5
-4026,view_773,call_function,view.default,backward,22,1,1,1,3720,2447,5
-4027,alias_default_887,call_function,alias.default,backward,22,1,1,2,3712,2447,4
-4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5
-4029,permute_495,call_function,permute.default,backward,22,1,1,1,4,2443,3
-4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5
-4031,permute_496,call_function,permute.default,backward,22,1,1,1,3714,2,4
-4032,dtype_cast_307,call_function,dtype_cast.default,backward,22,1,1,1,3715,1,4
-4033,alias_default_1447,call_function,alias.default,backward,22,1,1,0,3716,0,3
-4034,alias_default_888,call_function,alias.default,backward,22,1,1,2,3721,2447,4
-4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5
-4036,permute_499,call_function,permute.default,backward,22,1,1,1,4,2443,3
-4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5
-4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10
-4039,permute_500,call_function,permute.default,backward,22,1,1,1,3723,2,4
-4040,dtype_cast_308,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4
-4041,alias_default_1446,call_function,alias.default,backward,22,1,1,0,3725,0,3
-4042,alias_default_889,call_function,alias.default,backward,22,1,1,2,3721,2446,4
-4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5
-4044,permute_503,call_function,permute.default,backward,22,1,1,1,4,2442,3
-4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5
-4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10
-4047,permute_504,call_function,permute.default,backward,22,1,1,1,3723,2,4
-4048,dtype_cast_309,call_function,dtype_cast.default,backward,22,1,1,1,3724,1,4
-4049,alias_default_1445,call_function,alias.default,backward,22,1,1,0,3725,0,3
-4050,convert_element_type_950,call_function,convert_element_type.default,backward,22,1,1,1,3747,2439,8
-4051,convert_element_type_951,call_function,convert_element_type.default,backward,22,1,1,1,2429,2439,4
-4052,convert_element_type_952,call_function,convert_element_type.default,backward,22,1,1,1,3,2433,2
-4053,alias_default_890,call_function,alias.default,backward,22,1,1,2,3748,2438,4
-4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8
-4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8
-4056,alias_default_891,call_function,alias.default,backward,22,1,1,2,3751,2431,4
-4057,alias_default_892,call_function,alias.default,backward,22,1,1,3,2438,2437,4
-4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8
-4059,sum_25,call_function,sum.dim_IntList,backward,22,1,1,1,3756,2429,5
-4060,div_40,call_function,div.Tensor,backward,22,1,1,1,2439,2429,6
-4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8
-4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10
-4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8
-4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8
-4065,sum_26,call_function,sum.dim_IntList,backward,22,1,1,1,3753,3,5
-4066,convert_element_type_953,call_function,convert_element_type.default,backward,22,1,1,1,3761,2425,6
-4067,convert_element_type_954,call_function,convert_element_type.default,backward,22,1,1,1,3754,2,3
-4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10
-4069,dtype_cast_310,call_function,dtype_cast.default,backward,22,1,1,1,3755,1,3
-4070,alias_default_1452,call_function,alias.default,backward,22,1,1,0,3756,0,2
-4071,alias_default_893,call_function,alias.default,unknown,,1,1,3,3763,2423,4
-4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5
-4073,permute_507,call_function,permute.default,backward,21,1,1,1,4,2419,3
-4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5
-4075,permute_508,call_function,permute.default,backward,21,1,1,1,3765,2,4
-4076,dtype_cast_311,call_function,dtype_cast.default,backward,21,1,1,1,3766,1,4
-4077,alias_default_1441,call_function,alias.default,backward,21,1,1,0,3767,0,3
-4078,alias_default_894,call_function,alias.default,backward,21,1,1,2,3766,2417,4
-4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8
-4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8
-4081,alias_default_895,call_function,alias.default,backward,21,1,1,2,3768,2404,4
-4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5
-4083,permute_511,call_function,permute.default,backward,21,1,1,1,4,2400,3
-4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5
-4085,permute_512,call_function,permute.default,backward,21,1,1,1,3770,2,4
-4086,dtype_cast_312,call_function,dtype_cast.default,backward,21,1,1,1,3771,1,4
-4087,alias_default_1442,call_function,alias.default,backward,21,1,1,0,3772,0,3
-4088,convert_element_type_963,call_function,convert_element_type.default,backward,21,1,1,1,3768,2408,6
-4089,convert_element_type_964,call_function,convert_element_type.default,backward,21,1,1,1,2406,2418,4
-4090,alias_default_896,call_function,alias.default,backward,21,1,1,2,2407,2417,4
-4091,neg_34,call_function,neg.default,backward,21,1,1,1,2408,2416,8
-4092,exp_34,call_function,exp.default,backward,21,1,1,1,2409,2415,6
-4093,add_183,call_function,add.Tensor,backward,21,1,1,1,2410,2414,4
-4094,reciprocal_6,call_function,reciprocal.default,backward,21,1,1,1,2411,2413,4
-4095,mul_326,call_function,mul.Tensor,backward,21,1,1,1,2412,2412,6
-4096,alias_default_897,call_function,alias.default,backward,21,1,1,2,2413,2411,4
-4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8
-4098,sub_19,call_function,sub.Tensor,backward,21,1,1,1,2414,2409,4
-4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8
-4100,add_184,call_function,add.Tensor,backward,21,1,1,1,2416,2407,4
-4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8
-4102,convert_element_type_965,call_function,convert_element_type.default,backward,21,1,1,1,3782,2405,6
-4103,alias_default_898,call_function,alias.default,backward,21,1,1,2,3783,2404,4
-4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5
-4105,permute_515,call_function,permute.default,backward,21,1,1,1,4,2400,3
-4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5
-4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10
-4108,permute_516,call_function,permute.default,backward,21,1,1,1,3785,2,4
-4109,dtype_cast_313,call_function,dtype_cast.default,backward,21,1,1,1,3786,1,4
-4110,alias_default_1440,call_function,alias.default,backward,21,1,1,0,3787,0,3
-4111,convert_element_type_970,call_function,convert_element_type.default,backward,21,1,1,1,3791,2397,8
-4112,convert_element_type_971,call_function,convert_element_type.default,backward,21,1,1,1,2386,2397,4
-4113,convert_element_type_972,call_function,convert_element_type.default,backward,21,1,1,1,3,2391,2
-4114,alias_default_899,call_function,alias.default,backward,21,1,1,2,3792,2396,4
-4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8
-4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8
-4117,alias_default_900,call_function,alias.default,backward,21,1,1,2,3795,2389,4
-4118,alias_default_901,call_function,alias.default,backward,21,1,1,3,2395,2395,4
-4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8
-4120,sum_27,call_function,sum.dim_IntList,backward,21,1,1,1,3800,2387,5
-4121,div_41,call_function,div.Tensor,backward,21,1,1,1,2396,2387,6
-4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8
-4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10
-4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8
-4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8
-4126,sum_28,call_function,sum.dim_IntList,backward,21,1,1,1,3797,3,5
-4127,convert_element_type_973,call_function,convert_element_type.default,backward,21,1,1,1,3805,2383,6
-4128,convert_element_type_974,call_function,convert_element_type.default,backward,21,1,1,1,3798,2,3
-4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10
-4130,dtype_cast_314,call_function,dtype_cast.default,backward,21,1,1,1,3799,1,3
-4131,alias_default_1444,call_function,alias.default,backward,21,1,1,0,3800,0,2
-4132,alias_default_902,call_function,alias.default,unknown,,1,1,3,3807,2381,4
-4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5
-4134,permute_519,call_function,permute.default,backward,21,1,1,1,4,2377,3
-4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5
-4136,permute_520,call_function,permute.default,backward,21,1,1,1,3809,2,4
-4137,dtype_cast_315,call_function,dtype_cast.default,backward,21,1,1,1,3810,1,4
-4138,alias_default_1439,call_function,alias.default,backward,21,1,1,0,3811,0,3
-4139,view_788,call_function,view.default,backward,21,1,1,1,3810,2375,4
-4140,permute_521,call_function,permute.default,backward,21,1,1,1,3811,2374,4
-4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2
-4142,getitem_270,call_function,getitem,backward,21,1,1,1,3816,2346,2
-4143,getitem_271,call_function,getitem,backward,21,1,1,1,3816,2347,2
-4144,getitem_272,call_function,getitem,backward,21,1,1,1,3816,2340,2
-4145,permute_522,call_function,permute.default,backward,21,1,1,1,3817,2339,2
-4146,permute_523,call_function,permute.default,backward,21,1,1,1,3817,2346,2
-4147,permute_524,call_function,permute.default,backward,21,1,1,1,3817,2345,2
-4148,convert_element_type_979,call_function,convert_element_type.default,backward,21,1,1,1,3818,2345,2
-4149,convert_element_type_980,call_function,convert_element_type.default,backward,21,1,1,1,3818,2344,2
-4150,view_789,call_function,view.default,backward,21,1,1,1,3819,2344,2
-4151,view_as_complex_68,call_function,view_as_complex.default,backward,21,1,1,1,3820,2343,6
-4152,_conj_12,call_function,_conj.default,backward,21,1,1,1,4,2344,3
-4153,clone_54,call_function,clone.default,backward,21,1,1,1,5,2343,3
-4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8
-4155,view_790,call_function,view.default,backward,21,1,1,1,3819,2343,2
-4156,view_as_complex_69,call_function,view_as_complex.default,backward,21,1,1,1,3820,2342,6
-4157,_conj_13,call_function,_conj.default,backward,21,1,1,1,4,2343,3
-4158,clone_55,call_function,clone.default,backward,21,1,1,1,5,2342,3
-4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8
-4160,view_as_real_68,call_function,view_as_real.default,backward,21,1,1,1,3824,2341,6
-4161,view_791,call_function,view.default,backward,21,1,1,1,3825,2340,6
-4162,convert_element_type_981,call_function,convert_element_type.default,backward,21,1,1,1,3826,2339,6
-4163,view_as_real_69,call_function,view_as_real.default,backward,21,1,1,1,3824,2340,6
-4164,view_792,call_function,view.default,backward,21,1,1,1,3825,2339,6
-4165,convert_element_type_982,call_function,convert_element_type.default,backward,21,1,1,1,3826,2338,6
-4166,view_793,call_function,view.default,backward,21,1,1,1,3818,2338,2
-4167,view_794,call_function,view.default,backward,21,1,1,1,3827,2338,5
-4168,view_795,call_function,view.default,backward,21,1,1,1,3827,2337,5
-4169,alias_default_903,call_function,alias.default,backward,21,1,1,2,3819,2337,4
-4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5
-4171,permute_527,call_function,permute.default,backward,21,1,1,1,4,2333,3
-4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5
-4173,permute_528,call_function,permute.default,backward,21,1,1,1,3821,2,4
-4174,dtype_cast_316,call_function,dtype_cast.default,backward,21,1,1,1,3822,1,4
-4175,alias_default_1438,call_function,alias.default,backward,21,1,1,0,3823,0,3
-4176,alias_default_904,call_function,alias.default,backward,21,1,1,2,3828,2337,4
-4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5
-4178,permute_531,call_function,permute.default,backward,21,1,1,1,4,2333,3
-4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5
-4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10
-4181,permute_532,call_function,permute.default,backward,21,1,1,1,3830,2,4
-4182,dtype_cast_317,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4
-4183,alias_default_1437,call_function,alias.default,backward,21,1,1,0,3832,0,3
-4184,alias_default_905,call_function,alias.default,backward,21,1,1,2,3828,2336,4
-4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5
-4186,permute_535,call_function,permute.default,backward,21,1,1,1,4,2332,3
-4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5
-4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10
-4189,permute_536,call_function,permute.default,backward,21,1,1,1,3830,2,4
-4190,dtype_cast_318,call_function,dtype_cast.default,backward,21,1,1,1,3831,1,4
-4191,alias_default_1436,call_function,alias.default,backward,21,1,1,0,3832,0,3
-4192,convert_element_type_995,call_function,convert_element_type.default,backward,21,1,1,1,3854,2329,8
-4193,convert_element_type_996,call_function,convert_element_type.default,backward,21,1,1,1,2319,2329,4
-4194,convert_element_type_997,call_function,convert_element_type.default,backward,21,1,1,1,3,2323,2
-4195,alias_default_906,call_function,alias.default,backward,21,1,1,2,3855,2328,4
-4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8
-4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8
-4198,alias_default_907,call_function,alias.default,backward,21,1,1,2,3858,2321,4
-4199,alias_default_908,call_function,alias.default,backward,21,1,1,3,2328,2327,4
-4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8
-4201,sum_29,call_function,sum.dim_IntList,backward,21,1,1,1,3863,2319,5
-4202,div_42,call_function,div.Tensor,backward,21,1,1,1,2329,2319,6
-4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8
-4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10
-4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8
-4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8
-4207,sum_30,call_function,sum.dim_IntList,backward,21,1,1,1,3860,3,5
-4208,convert_element_type_998,call_function,convert_element_type.default,backward,21,1,1,1,3868,2315,6
-4209,convert_element_type_999,call_function,convert_element_type.default,backward,21,1,1,1,3861,2,3
-4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10
-4211,dtype_cast_319,call_function,dtype_cast.default,backward,21,1,1,1,3862,1,3
-4212,alias_default_1443,call_function,alias.default,backward,21,1,1,0,3863,0,2
-4213,alias_default_909,call_function,alias.default,unknown,,1,1,3,3870,2313,4
-4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5
-4215,permute_539,call_function,permute.default,backward,20,1,1,1,4,2309,3
-4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5
-4217,permute_540,call_function,permute.default,backward,20,1,1,1,3872,2,4
-4218,dtype_cast_320,call_function,dtype_cast.default,backward,20,1,1,1,3873,1,4
-4219,alias_default_1432,call_function,alias.default,backward,20,1,1,0,3874,0,3
-4220,alias_default_910,call_function,alias.default,backward,20,1,1,2,3873,2307,4
-4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8
-4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8
-4223,alias_default_911,call_function,alias.default,backward,20,1,1,2,3875,2294,4
-4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5
-4225,permute_543,call_function,permute.default,backward,20,1,1,1,4,2290,3
-4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5
-4227,permute_544,call_function,permute.default,backward,20,1,1,1,3877,2,4
-4228,dtype_cast_321,call_function,dtype_cast.default,backward,20,1,1,1,3878,1,4
-4229,alias_default_1433,call_function,alias.default,backward,20,1,1,0,3879,0,3
-4230,convert_element_type_1008,call_function,convert_element_type.default,backward,20,1,1,1,3875,2298,6
-4231,convert_element_type_1009,call_function,convert_element_type.default,backward,20,1,1,1,2296,2308,4
-4232,alias_default_912,call_function,alias.default,backward,20,1,1,2,2297,2307,4
-4233,neg_35,call_function,neg.default,backward,20,1,1,1,2298,2306,8
-4234,exp_35,call_function,exp.default,backward,20,1,1,1,2299,2305,6
-4235,add_190,call_function,add.Tensor,backward,20,1,1,1,2300,2304,4
-4236,reciprocal_7,call_function,reciprocal.default,backward,20,1,1,1,2301,2303,4
-4237,mul_346,call_function,mul.Tensor,backward,20,1,1,1,2302,2302,6
-4238,alias_default_913,call_function,alias.default,backward,20,1,1,2,2303,2301,4
-4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8
-4240,sub_22,call_function,sub.Tensor,backward,20,1,1,1,2304,2299,4
-4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8
-4242,add_191,call_function,add.Tensor,backward,20,1,1,1,2306,2297,4
-4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8
-4244,convert_element_type_1010,call_function,convert_element_type.default,backward,20,1,1,1,3889,2295,6
-4245,alias_default_914,call_function,alias.default,backward,20,1,1,2,3890,2294,4
-4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5
-4247,permute_547,call_function,permute.default,backward,20,1,1,1,4,2290,3
-4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5
-4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10
-4250,permute_548,call_function,permute.default,backward,20,1,1,1,3892,2,4
-4251,dtype_cast_322,call_function,dtype_cast.default,backward,20,1,1,1,3893,1,4
-4252,alias_default_1431,call_function,alias.default,backward,20,1,1,0,3894,0,3
-4253,convert_element_type_1015,call_function,convert_element_type.default,backward,20,1,1,1,3898,2287,8
-4254,convert_element_type_1016,call_function,convert_element_type.default,backward,20,1,1,1,2276,2287,4
-4255,convert_element_type_1017,call_function,convert_element_type.default,backward,20,1,1,1,3,2281,2
-4256,alias_default_915,call_function,alias.default,backward,20,1,1,2,3899,2286,4
-4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8
-4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8
-4259,alias_default_916,call_function,alias.default,backward,20,1,1,2,3902,2279,4
-4260,alias_default_917,call_function,alias.default,backward,20,1,1,3,2285,2285,4
-4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8
-4262,sum_31,call_function,sum.dim_IntList,backward,20,1,1,1,3907,2277,5
-4263,div_43,call_function,div.Tensor,backward,20,1,1,1,2286,2277,6
-4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8
-4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10
-4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8
-4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8
-4268,sum_32,call_function,sum.dim_IntList,backward,20,1,1,1,3904,3,5
-4269,convert_element_type_1018,call_function,convert_element_type.default,backward,20,1,1,1,3912,2273,6
-4270,convert_element_type_1019,call_function,convert_element_type.default,backward,20,1,1,1,3905,2,3
-4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10
-4272,dtype_cast_323,call_function,dtype_cast.default,backward,20,1,1,1,3906,1,3
-4273,alias_default_1435,call_function,alias.default,backward,20,1,1,0,3907,0,2
-4274,alias_default_918,call_function,alias.default,unknown,,1,1,3,3914,2271,4
-4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5
-4276,permute_551,call_function,permute.default,backward,20,1,1,1,4,2267,3
-4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5
-4278,permute_552,call_function,permute.default,backward,20,1,1,1,3916,2,4
-4279,dtype_cast_324,call_function,dtype_cast.default,backward,20,1,1,1,3917,1,4
-4280,alias_default_1430,call_function,alias.default,backward,20,1,1,0,3918,0,3
-4281,view_810,call_function,view.default,backward,20,1,1,1,3917,2265,4
-4282,permute_553,call_function,permute.default,backward,20,1,1,1,3918,2264,4
-4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2
-4284,getitem_273,call_function,getitem,backward,20,1,1,1,3923,2236,2
-4285,getitem_274,call_function,getitem,backward,20,1,1,1,3923,2237,2
-4286,getitem_275,call_function,getitem,backward,20,1,1,1,3923,2230,2
-4287,permute_554,call_function,permute.default,backward,20,1,1,1,3924,2229,2
-4288,permute_555,call_function,permute.default,backward,20,1,1,1,3924,2236,2
-4289,permute_556,call_function,permute.default,backward,20,1,1,1,3924,2235,2
-4290,convert_element_type_1024,call_function,convert_element_type.default,backward,20,1,1,1,3925,2235,2
-4291,convert_element_type_1025,call_function,convert_element_type.default,backward,20,1,1,1,3925,2234,2
-4292,view_811,call_function,view.default,backward,20,1,1,1,3926,2234,2
-4293,view_as_complex_70,call_function,view_as_complex.default,backward,20,1,1,1,3927,2233,6
-4294,_conj_14,call_function,_conj.default,backward,20,1,1,1,4,2234,3
-4295,clone_62,call_function,clone.default,backward,20,1,1,1,5,2233,3
-4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8
-4297,view_812,call_function,view.default,backward,20,1,1,1,3926,2233,2
-4298,view_as_complex_71,call_function,view_as_complex.default,backward,20,1,1,1,3927,2232,6
-4299,_conj_15,call_function,_conj.default,backward,20,1,1,1,4,2233,3
-4300,clone_63,call_function,clone.default,backward,20,1,1,1,5,2232,3
-4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8
-4302,view_as_real_70,call_function,view_as_real.default,backward,20,1,1,1,3931,2231,6
-4303,view_813,call_function,view.default,backward,20,1,1,1,3932,2230,6
-4304,convert_element_type_1026,call_function,convert_element_type.default,backward,20,1,1,1,3933,2229,6
-4305,view_as_real_71,call_function,view_as_real.default,backward,20,1,1,1,3931,2230,6
-4306,view_814,call_function,view.default,backward,20,1,1,1,3932,2229,6
-4307,convert_element_type_1027,call_function,convert_element_type.default,backward,20,1,1,1,3933,2228,6
-4308,view_815,call_function,view.default,backward,20,1,1,1,3925,2228,2
-4309,view_816,call_function,view.default,backward,20,1,1,1,3934,2228,5
-4310,view_817,call_function,view.default,backward,20,1,1,1,3934,2227,5
-4311,alias_default_919,call_function,alias.default,backward,20,1,1,2,3926,2227,4
-4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5
-4313,permute_559,call_function,permute.default,backward,20,1,1,1,4,2223,3
-4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5
-4315,permute_560,call_function,permute.default,backward,20,1,1,1,3928,2,4
-4316,dtype_cast_325,call_function,dtype_cast.default,backward,20,1,1,1,3929,1,4
-4317,alias_default_1429,call_function,alias.default,backward,20,1,1,0,3930,0,3
-4318,alias_default_920,call_function,alias.default,backward,20,1,1,2,3935,2227,4
-4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5
-4320,permute_563,call_function,permute.default,backward,20,1,1,1,4,2223,3
-4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5
-4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10
-4323,permute_564,call_function,permute.default,backward,20,1,1,1,3937,2,4
-4324,dtype_cast_326,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4
-4325,alias_default_1428,call_function,alias.default,backward,20,1,1,0,3939,0,3
-4326,alias_default_921,call_function,alias.default,backward,20,1,1,2,3935,2226,4
-4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5
-4328,permute_567,call_function,permute.default,backward,20,1,1,1,4,2222,3
-4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5
-4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10
-4331,permute_568,call_function,permute.default,backward,20,1,1,1,3937,2,4
-4332,dtype_cast_327,call_function,dtype_cast.default,backward,20,1,1,1,3938,1,4
-4333,alias_default_1427,call_function,alias.default,backward,20,1,1,0,3939,0,3
-4334,convert_element_type_1040,call_function,convert_element_type.default,backward,20,1,1,1,3961,2219,8
-4335,convert_element_type_1041,call_function,convert_element_type.default,backward,20,1,1,1,2209,2219,4
-4336,convert_element_type_1042,call_function,convert_element_type.default,backward,20,1,1,1,3,2213,2
-4337,alias_default_922,call_function,alias.default,backward,20,1,1,2,3962,2218,4
-4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8
-4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8
-4340,alias_default_923,call_function,alias.default,backward,20,1,1,2,3965,2211,4
-4341,alias_default_924,call_function,alias.default,backward,20,1,1,3,2218,2217,4
-4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8
-4343,sum_33,call_function,sum.dim_IntList,backward,20,1,1,1,3970,2209,5
-4344,div_44,call_function,div.Tensor,backward,20,1,1,1,2219,2209,6
-4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8
-4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10
-4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8
-4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8
-4349,sum_34,call_function,sum.dim_IntList,backward,20,1,1,1,3967,3,5
-4350,convert_element_type_1043,call_function,convert_element_type.default,backward,20,1,1,1,3975,2205,6
-4351,convert_element_type_1044,call_function,convert_element_type.default,backward,20,1,1,1,3968,2,3
-4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10
-4353,dtype_cast_328,call_function,dtype_cast.default,backward,20,1,1,1,3969,1,3
-4354,alias_default_1434,call_function,alias.default,backward,20,1,1,0,3970,0,2
-4355,alias_default_925,call_function,alias.default,unknown,,1,1,3,3977,2203,4
-4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5
-4357,permute_571,call_function,permute.default,backward,19,1,1,1,4,2199,3
-4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5
-4359,permute_572,call_function,permute.default,backward,19,1,1,1,3979,2,4
-4360,dtype_cast_329,call_function,dtype_cast.default,backward,19,1,1,1,3980,1,4
-4361,alias_default_1423,call_function,alias.default,backward,19,1,1,0,3981,0,3
-4362,alias_default_926,call_function,alias.default,backward,19,1,1,2,3980,2197,4
-4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8
-4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8
-4365,alias_default_927,call_function,alias.default,backward,19,1,1,2,3982,2184,4
-4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5
-4367,permute_575,call_function,permute.default,backward,19,1,1,1,4,2180,3
-4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5
-4369,permute_576,call_function,permute.default,backward,19,1,1,1,3984,2,4
-4370,dtype_cast_330,call_function,dtype_cast.default,backward,19,1,1,1,3985,1,4
-4371,alias_default_1424,call_function,alias.default,backward,19,1,1,0,3986,0,3
-4372,convert_element_type_1053,call_function,convert_element_type.default,backward,19,1,1,1,3982,2188,6
-4373,convert_element_type_1054,call_function,convert_element_type.default,backward,19,1,1,1,2186,2198,4
-4374,alias_default_928,call_function,alias.default,backward,19,1,1,2,2187,2197,4
-4375,neg_36,call_function,neg.default,backward,19,1,1,1,2188,2196,8
-4376,exp_36,call_function,exp.default,backward,19,1,1,1,2189,2195,6
-4377,add_197,call_function,add.Tensor,backward,19,1,1,1,2190,2194,4
-4378,reciprocal_8,call_function,reciprocal.default,backward,19,1,1,1,2191,2193,4
-4379,mul_366,call_function,mul.Tensor,backward,19,1,1,1,2192,2192,6
-4380,alias_default_929,call_function,alias.default,backward,19,1,1,2,2193,2191,4
-4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8
-4382,sub_25,call_function,sub.Tensor,backward,19,1,1,1,2194,2189,4
-4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8
-4384,add_198,call_function,add.Tensor,backward,19,1,1,1,2196,2187,4
-4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8
-4386,convert_element_type_1055,call_function,convert_element_type.default,backward,19,1,1,1,3996,2185,6
-4387,alias_default_930,call_function,alias.default,backward,19,1,1,2,3997,2184,4
-4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5
-4389,permute_579,call_function,permute.default,backward,19,1,1,1,4,2180,3
-4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5
-4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10
-4392,permute_580,call_function,permute.default,backward,19,1,1,1,3999,2,4
-4393,dtype_cast_331,call_function,dtype_cast.default,backward,19,1,1,1,4000,1,4
-4394,alias_default_1422,call_function,alias.default,backward,19,1,1,0,4001,0,3
-4395,convert_element_type_1060,call_function,convert_element_type.default,backward,19,1,1,1,4005,2177,8
-4396,convert_element_type_1061,call_function,convert_element_type.default,backward,19,1,1,1,2166,2177,4
-4397,convert_element_type_1062,call_function,convert_element_type.default,backward,19,1,1,1,3,2171,2
-4398,alias_default_931,call_function,alias.default,backward,19,1,1,2,4006,2176,4
-4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8
-4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8
-4401,alias_default_932,call_function,alias.default,backward,19,1,1,2,4009,2169,4
-4402,alias_default_933,call_function,alias.default,backward,19,1,1,3,2175,2175,4
-4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8
-4404,sum_35,call_function,sum.dim_IntList,backward,19,1,1,1,4014,2167,5
-4405,div_45,call_function,div.Tensor,backward,19,1,1,1,2176,2167,6
-4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8
-4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10
-4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8
-4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8
-4410,sum_36,call_function,sum.dim_IntList,backward,19,1,1,1,4011,3,5
-4411,convert_element_type_1063,call_function,convert_element_type.default,backward,19,1,1,1,4019,2163,6
-4412,convert_element_type_1064,call_function,convert_element_type.default,backward,19,1,1,1,4012,2,3
-4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10
-4414,dtype_cast_332,call_function,dtype_cast.default,backward,19,1,1,1,4013,1,3
-4415,alias_default_1426,call_function,alias.default,backward,19,1,1,0,4014,0,2
-4416,alias_default_934,call_function,alias.default,unknown,,1,1,3,4021,2161,4
-4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5
-4418,permute_583,call_function,permute.default,backward,19,1,1,1,4,2157,3
-4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5
-4420,permute_584,call_function,permute.default,backward,19,1,1,1,4023,2,4
-4421,dtype_cast_333,call_function,dtype_cast.default,backward,19,1,1,1,4024,1,4
-4422,alias_default_1421,call_function,alias.default,backward,19,1,1,0,4025,0,3
-4423,view_832,call_function,view.default,backward,19,1,1,1,4024,2155,4
-4424,permute_585,call_function,permute.default,backward,19,1,1,1,4025,2154,4
-4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2
-4426,getitem_276,call_function,getitem,backward,19,1,1,1,4030,2126,2
-4427,getitem_277,call_function,getitem,backward,19,1,1,1,4030,2127,2
-4428,getitem_278,call_function,getitem,backward,19,1,1,1,4030,2120,2
-4429,permute_586,call_function,permute.default,backward,19,1,1,1,4031,2119,2
-4430,permute_587,call_function,permute.default,backward,19,1,1,1,4031,2126,2
-4431,permute_588,call_function,permute.default,backward,19,1,1,1,4031,2125,2
-4432,convert_element_type_1069,call_function,convert_element_type.default,backward,19,1,1,1,4032,2125,2
-4433,convert_element_type_1070,call_function,convert_element_type.default,backward,19,1,1,1,4032,2124,2
-4434,view_833,call_function,view.default,backward,19,1,1,1,4033,2124,2
-4435,view_as_complex_72,call_function,view_as_complex.default,backward,19,1,1,1,4034,2123,6
-4436,_conj_16,call_function,_conj.default,backward,19,1,1,1,4,2124,3
-4437,clone_70,call_function,clone.default,backward,19,1,1,1,5,2123,3
-4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8
-4439,view_834,call_function,view.default,backward,19,1,1,1,4033,2123,2
-4440,view_as_complex_73,call_function,view_as_complex.default,backward,19,1,1,1,4034,2122,6
-4441,_conj_17,call_function,_conj.default,backward,19,1,1,1,4,2123,3
-4442,clone_71,call_function,clone.default,backward,19,1,1,1,5,2122,3
-4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8
-4444,view_as_real_72,call_function,view_as_real.default,backward,19,1,1,1,4038,2121,6
-4445,view_835,call_function,view.default,backward,19,1,1,1,4039,2120,6
-4446,convert_element_type_1071,call_function,convert_element_type.default,backward,19,1,1,1,4040,2119,6
-4447,view_as_real_73,call_function,view_as_real.default,backward,19,1,1,1,4038,2120,6
-4448,view_836,call_function,view.default,backward,19,1,1,1,4039,2119,6
-4449,convert_element_type_1072,call_function,convert_element_type.default,backward,19,1,1,1,4040,2118,6
-4450,view_837,call_function,view.default,backward,19,1,1,1,4032,2118,2
-4451,view_838,call_function,view.default,backward,19,1,1,1,4041,2118,5
-4452,view_839,call_function,view.default,backward,19,1,1,1,4041,2117,5
-4453,alias_default_935,call_function,alias.default,backward,19,1,1,2,4033,2117,4
-4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5
-4455,permute_591,call_function,permute.default,backward,19,1,1,1,4,2113,3
-4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5
-4457,permute_592,call_function,permute.default,backward,19,1,1,1,4035,2,4
-4458,dtype_cast_334,call_function,dtype_cast.default,backward,19,1,1,1,4036,1,4
-4459,alias_default_1420,call_function,alias.default,backward,19,1,1,0,4037,0,3
-4460,alias_default_936,call_function,alias.default,backward,19,1,1,2,4042,2117,4
-4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5
-4462,permute_595,call_function,permute.default,backward,19,1,1,1,4,2113,3
-4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5
-4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10
-4465,permute_596,call_function,permute.default,backward,19,1,1,1,4044,2,4
-4466,dtype_cast_335,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4
-4467,alias_default_1419,call_function,alias.default,backward,19,1,1,0,4046,0,3
-4468,alias_default_937,call_function,alias.default,backward,19,1,1,2,4042,2116,4
-4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5
-4470,permute_599,call_function,permute.default,backward,19,1,1,1,4,2112,3
-4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5
-4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10
-4473,permute_600,call_function,permute.default,backward,19,1,1,1,4044,2,4
-4474,dtype_cast_336,call_function,dtype_cast.default,backward,19,1,1,1,4045,1,4
-4475,alias_default_1418,call_function,alias.default,backward,19,1,1,0,4046,0,3
-4476,convert_element_type_1085,call_function,convert_element_type.default,backward,19,1,1,1,4068,2109,8
-4477,convert_element_type_1086,call_function,convert_element_type.default,backward,19,1,1,1,2099,2109,4
-4478,convert_element_type_1087,call_function,convert_element_type.default,backward,19,1,1,1,3,2103,2
-4479,alias_default_938,call_function,alias.default,backward,19,1,1,2,4069,2108,4
-4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8
-4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8
-4482,alias_default_939,call_function,alias.default,backward,19,1,1,2,4072,2101,4
-4483,alias_default_940,call_function,alias.default,backward,19,1,1,3,2108,2107,4
-4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8
-4485,sum_37,call_function,sum.dim_IntList,backward,19,1,1,1,4077,2099,5
-4486,div_46,call_function,div.Tensor,backward,19,1,1,1,2109,2099,6
-4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8
-4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10
-4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8
-4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8
-4491,sum_38,call_function,sum.dim_IntList,backward,19,1,1,1,4074,3,5
-4492,convert_element_type_1088,call_function,convert_element_type.default,backward,19,1,1,1,4082,2095,6
-4493,convert_element_type_1089,call_function,convert_element_type.default,backward,19,1,1,1,4075,2,3
-4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10
-4495,dtype_cast_337,call_function,dtype_cast.default,backward,19,1,1,1,4076,1,3
-4496,alias_default_1425,call_function,alias.default,backward,19,1,1,0,4077,0,2
-4497,alias_default_941,call_function,alias.default,unknown,,1,1,3,4084,2093,4
-4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5
-4499,permute_603,call_function,permute.default,backward,18,1,1,1,4,2089,3
-4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5
-4501,permute_604,call_function,permute.default,backward,18,1,1,1,4086,2,4
-4502,dtype_cast_338,call_function,dtype_cast.default,backward,18,1,1,1,4087,1,4
-4503,alias_default_1414,call_function,alias.default,backward,18,1,1,0,4088,0,3
-4504,alias_default_942,call_function,alias.default,backward,18,1,1,2,4087,2087,4
-4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8
-4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8
-4507,alias_default_943,call_function,alias.default,backward,18,1,1,2,4089,2074,4
-4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5
-4509,permute_607,call_function,permute.default,backward,18,1,1,1,4,2070,3
-4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5
-4511,permute_608,call_function,permute.default,backward,18,1,1,1,4091,2,4
-4512,dtype_cast_339,call_function,dtype_cast.default,backward,18,1,1,1,4092,1,4
-4513,alias_default_1415,call_function,alias.default,backward,18,1,1,0,4093,0,3
-4514,convert_element_type_1098,call_function,convert_element_type.default,backward,18,1,1,1,4089,2078,6
-4515,convert_element_type_1099,call_function,convert_element_type.default,backward,18,1,1,1,2076,2088,4
-4516,alias_default_944,call_function,alias.default,backward,18,1,1,2,2077,2087,4
-4517,neg_37,call_function,neg.default,backward,18,1,1,1,2078,2086,8
-4518,exp_37,call_function,exp.default,backward,18,1,1,1,2079,2085,6
-4519,add_204,call_function,add.Tensor,backward,18,1,1,1,2080,2084,4
-4520,reciprocal_9,call_function,reciprocal.default,backward,18,1,1,1,2081,2083,4
-4521,mul_386,call_function,mul.Tensor,backward,18,1,1,1,2082,2082,6
-4522,alias_default_945,call_function,alias.default,backward,18,1,1,2,2083,2081,4
-4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8
-4524,sub_28,call_function,sub.Tensor,backward,18,1,1,1,2084,2079,4
-4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8
-4526,add_205,call_function,add.Tensor,backward,18,1,1,1,2086,2077,4
-4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8
-4528,convert_element_type_1100,call_function,convert_element_type.default,backward,18,1,1,1,4103,2075,6
-4529,alias_default_946,call_function,alias.default,backward,18,1,1,2,4104,2074,4
-4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5
-4531,permute_611,call_function,permute.default,backward,18,1,1,1,4,2070,3
-4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5
-4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10
-4534,permute_612,call_function,permute.default,backward,18,1,1,1,4106,2,4
-4535,dtype_cast_340,call_function,dtype_cast.default,backward,18,1,1,1,4107,1,4
-4536,alias_default_1413,call_function,alias.default,backward,18,1,1,0,4108,0,3
-4537,convert_element_type_1105,call_function,convert_element_type.default,backward,18,1,1,1,4112,2067,8
-4538,convert_element_type_1106,call_function,convert_element_type.default,backward,18,1,1,1,2056,2067,4
-4539,convert_element_type_1107,call_function,convert_element_type.default,backward,18,1,1,1,3,2061,2
-4540,alias_default_947,call_function,alias.default,backward,18,1,1,2,4113,2066,4
-4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8
-4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8
-4543,alias_default_948,call_function,alias.default,backward,18,1,1,2,4116,2059,4
-4544,alias_default_949,call_function,alias.default,backward,18,1,1,3,2065,2065,4
-4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8
-4546,sum_39,call_function,sum.dim_IntList,backward,18,1,1,1,4121,2057,5
-4547,div_47,call_function,div.Tensor,backward,18,1,1,1,2066,2057,6
-4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8
-4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10
-4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8
-4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8
-4552,sum_40,call_function,sum.dim_IntList,backward,18,1,1,1,4118,3,5
-4553,convert_element_type_1108,call_function,convert_element_type.default,backward,18,1,1,1,4126,2053,6
-4554,convert_element_type_1109,call_function,convert_element_type.default,backward,18,1,1,1,4119,2,3
-4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10
-4556,dtype_cast_341,call_function,dtype_cast.default,backward,18,1,1,1,4120,1,3
-4557,alias_default_1417,call_function,alias.default,backward,18,1,1,0,4121,0,2
-4558,alias_default_950,call_function,alias.default,unknown,,1,1,3,4128,2051,4
-4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5
-4560,permute_615,call_function,permute.default,backward,18,1,1,1,4,2047,3
-4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5
-4562,permute_616,call_function,permute.default,backward,18,1,1,1,4130,2,4
-4563,dtype_cast_342,call_function,dtype_cast.default,backward,18,1,1,1,4131,1,4
-4564,alias_default_1412,call_function,alias.default,backward,18,1,1,0,4132,0,3
-4565,view_854,call_function,view.default,backward,18,1,1,1,4131,2045,4
-4566,permute_617,call_function,permute.default,backward,18,1,1,1,4132,2044,4
-4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2
-4568,getitem_279,call_function,getitem,backward,18,1,1,1,4137,2016,2
-4569,getitem_280,call_function,getitem,backward,18,1,1,1,4137,2017,2
-4570,getitem_281,call_function,getitem,backward,18,1,1,1,4137,2010,2
-4571,permute_618,call_function,permute.default,backward,18,1,1,1,4138,2009,2
-4572,permute_619,call_function,permute.default,backward,18,1,1,1,4138,2016,2
-4573,permute_620,call_function,permute.default,backward,18,1,1,1,4138,2015,2
-4574,convert_element_type_1114,call_function,convert_element_type.default,backward,18,1,1,1,4139,2015,2
-4575,convert_element_type_1115,call_function,convert_element_type.default,backward,18,1,1,1,4139,2014,2
-4576,view_855,call_function,view.default,backward,18,1,1,1,4140,2014,2
-4577,view_as_complex_74,call_function,view_as_complex.default,backward,18,1,1,1,4141,2013,6
-4578,_conj_18,call_function,_conj.default,backward,18,1,1,1,4,2014,3
-4579,clone_78,call_function,clone.default,backward,18,1,1,1,5,2013,3
-4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8
-4581,view_856,call_function,view.default,backward,18,1,1,1,4140,2013,2
-4582,view_as_complex_75,call_function,view_as_complex.default,backward,18,1,1,1,4141,2012,6
-4583,_conj_19,call_function,_conj.default,backward,18,1,1,1,4,2013,3
-4584,clone_79,call_function,clone.default,backward,18,1,1,1,5,2012,3
-4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8
-4586,view_as_real_74,call_function,view_as_real.default,backward,18,1,1,1,4145,2011,6
-4587,view_857,call_function,view.default,backward,18,1,1,1,4146,2010,6
-4588,convert_element_type_1116,call_function,convert_element_type.default,backward,18,1,1,1,4147,2009,6
-4589,view_as_real_75,call_function,view_as_real.default,backward,18,1,1,1,4145,2010,6
-4590,view_858,call_function,view.default,backward,18,1,1,1,4146,2009,6
-4591,convert_element_type_1117,call_function,convert_element_type.default,backward,18,1,1,1,4147,2008,6
-4592,view_859,call_function,view.default,backward,18,1,1,1,4139,2008,2
-4593,view_860,call_function,view.default,backward,18,1,1,1,4148,2008,5
-4594,view_861,call_function,view.default,backward,18,1,1,1,4148,2007,5
-4595,alias_default_951,call_function,alias.default,backward,18,1,1,2,4140,2007,4
-4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5
-4597,permute_623,call_function,permute.default,backward,18,1,1,1,4,2003,3
-4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5
-4599,permute_624,call_function,permute.default,backward,18,1,1,1,4142,2,4
-4600,dtype_cast_343,call_function,dtype_cast.default,backward,18,1,1,1,4143,1,4
-4601,alias_default_1411,call_function,alias.default,backward,18,1,1,0,4144,0,3
-4602,alias_default_952,call_function,alias.default,backward,18,1,1,2,4149,2007,4
-4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5
-4604,permute_627,call_function,permute.default,backward,18,1,1,1,4,2003,3
-4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5
-4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10
-4607,permute_628,call_function,permute.default,backward,18,1,1,1,4151,2,4
-4608,dtype_cast_344,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4
-4609,alias_default_1410,call_function,alias.default,backward,18,1,1,0,4153,0,3
-4610,alias_default_953,call_function,alias.default,backward,18,1,1,2,4149,2006,4
-4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5
-4612,permute_631,call_function,permute.default,backward,18,1,1,1,4,2002,3
-4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5
-4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10
-4615,permute_632,call_function,permute.default,backward,18,1,1,1,4151,2,4
-4616,dtype_cast_345,call_function,dtype_cast.default,backward,18,1,1,1,4152,1,4
-4617,alias_default_1409,call_function,alias.default,backward,18,1,1,0,4153,0,3
-4618,convert_element_type_1130,call_function,convert_element_type.default,backward,18,1,1,1,4175,1999,8
-4619,convert_element_type_1131,call_function,convert_element_type.default,backward,18,1,1,1,1989,1999,4
-4620,convert_element_type_1132,call_function,convert_element_type.default,backward,18,1,1,1,3,1993,2
-4621,alias_default_954,call_function,alias.default,backward,18,1,1,2,4176,1998,4
-4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8
-4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8
-4624,alias_default_955,call_function,alias.default,backward,18,1,1,2,4179,1991,4
-4625,alias_default_956,call_function,alias.default,backward,18,1,1,3,1998,1997,4
-4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8
-4627,sum_41,call_function,sum.dim_IntList,backward,18,1,1,1,4184,1989,5
-4628,div_48,call_function,div.Tensor,backward,18,1,1,1,1999,1989,6
-4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8
-4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10
-4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8
-4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8
-4633,sum_42,call_function,sum.dim_IntList,backward,18,1,1,1,4181,3,5
-4634,convert_element_type_1133,call_function,convert_element_type.default,backward,18,1,1,1,4189,1985,6
-4635,convert_element_type_1134,call_function,convert_element_type.default,backward,18,1,1,1,4182,2,3
-4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10
-4637,dtype_cast_346,call_function,dtype_cast.default,backward,18,1,1,1,4183,1,3
-4638,alias_default_1416,call_function,alias.default,backward,18,1,1,0,4184,0,2
-4639,alias_default_957,call_function,alias.default,unknown,,1,1,3,4191,1983,4
-4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5
-4641,permute_635,call_function,permute.default,backward,17,1,1,1,4,1979,3
-4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5
-4643,permute_636,call_function,permute.default,backward,17,1,1,1,4193,2,4
-4644,dtype_cast_347,call_function,dtype_cast.default,backward,17,1,1,1,4194,1,4
-4645,alias_default_1405,call_function,alias.default,backward,17,1,1,0,4195,0,3
-4646,alias_default_958,call_function,alias.default,backward,17,1,1,2,4194,1977,4
-4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8
-4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8
-4649,alias_default_959,call_function,alias.default,backward,17,1,1,2,4196,1964,4
-4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5
-4651,permute_639,call_function,permute.default,backward,17,1,1,1,4,1960,3
-4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5
-4653,permute_640,call_function,permute.default,backward,17,1,1,1,4198,2,4
-4654,dtype_cast_348,call_function,dtype_cast.default,backward,17,1,1,1,4199,1,4
-4655,alias_default_1406,call_function,alias.default,backward,17,1,1,0,4200,0,3
-4656,convert_element_type_1143,call_function,convert_element_type.default,backward,17,1,1,1,4196,1968,6
-4657,convert_element_type_1144,call_function,convert_element_type.default,backward,17,1,1,1,1966,1978,4
-4658,alias_default_960,call_function,alias.default,backward,17,1,1,2,1967,1977,4
-4659,neg_38,call_function,neg.default,backward,17,1,1,1,1968,1976,8
-4660,exp_38,call_function,exp.default,backward,17,1,1,1,1969,1975,6
-4661,add_211,call_function,add.Tensor,backward,17,1,1,1,1970,1974,4
-4662,reciprocal_10,call_function,reciprocal.default,backward,17,1,1,1,1971,1973,4
-4663,mul_406,call_function,mul.Tensor,backward,17,1,1,1,1972,1972,6
-4664,alias_default_961,call_function,alias.default,backward,17,1,1,2,1973,1971,4
-4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8
-4666,sub_31,call_function,sub.Tensor,backward,17,1,1,1,1974,1969,4
-4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8
-4668,add_212,call_function,add.Tensor,backward,17,1,1,1,1976,1967,4
-4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8
-4670,convert_element_type_1145,call_function,convert_element_type.default,backward,17,1,1,1,4210,1965,6
-4671,alias_default_962,call_function,alias.default,backward,17,1,1,2,4211,1964,4
-4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5
-4673,permute_643,call_function,permute.default,backward,17,1,1,1,4,1960,3
-4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5
-4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10
-4676,permute_644,call_function,permute.default,backward,17,1,1,1,4213,2,4
-4677,dtype_cast_349,call_function,dtype_cast.default,backward,17,1,1,1,4214,1,4
-4678,alias_default_1404,call_function,alias.default,backward,17,1,1,0,4215,0,3
-4679,convert_element_type_1150,call_function,convert_element_type.default,backward,17,1,1,1,4219,1957,8
-4680,convert_element_type_1151,call_function,convert_element_type.default,backward,17,1,1,1,1946,1957,4
-4681,convert_element_type_1152,call_function,convert_element_type.default,backward,17,1,1,1,3,1951,2
-4682,alias_default_963,call_function,alias.default,backward,17,1,1,2,4220,1956,4
-4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8
-4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8
-4685,alias_default_964,call_function,alias.default,backward,17,1,1,2,4223,1949,4
-4686,alias_default_965,call_function,alias.default,backward,17,1,1,3,1955,1955,4
-4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8
-4688,sum_43,call_function,sum.dim_IntList,backward,17,1,1,1,4228,1947,5
-4689,div_49,call_function,div.Tensor,backward,17,1,1,1,1956,1947,6
-4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8
-4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10
-4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8
-4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8
-4694,sum_44,call_function,sum.dim_IntList,backward,17,1,1,1,4225,3,5
-4695,convert_element_type_1153,call_function,convert_element_type.default,backward,17,1,1,1,4233,1943,6
-4696,convert_element_type_1154,call_function,convert_element_type.default,backward,17,1,1,1,4226,2,3
-4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10
-4698,dtype_cast_350,call_function,dtype_cast.default,backward,17,1,1,1,4227,1,3
-4699,alias_default_1408,call_function,alias.default,backward,17,1,1,0,4228,0,2
-4700,alias_default_966,call_function,alias.default,unknown,,1,1,3,4235,1941,4
-4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5
-4702,permute_647,call_function,permute.default,backward,17,1,1,1,4,1937,3
-4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5
-4704,permute_648,call_function,permute.default,backward,17,1,1,1,4237,2,4
-4705,dtype_cast_351,call_function,dtype_cast.default,backward,17,1,1,1,4238,1,4
-4706,alias_default_1403,call_function,alias.default,backward,17,1,1,0,4239,0,3
-4707,view_876,call_function,view.default,backward,17,1,1,1,4238,1935,4
-4708,permute_649,call_function,permute.default,backward,17,1,1,1,4239,1934,4
-4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2
-4710,getitem_282,call_function,getitem,backward,17,1,1,1,4244,1906,2
-4711,getitem_283,call_function,getitem,backward,17,1,1,1,4244,1907,2
-4712,getitem_284,call_function,getitem,backward,17,1,1,1,4244,1900,2
-4713,permute_650,call_function,permute.default,backward,17,1,1,1,4245,1899,2
-4714,permute_651,call_function,permute.default,backward,17,1,1,1,4245,1906,2
-4715,permute_652,call_function,permute.default,backward,17,1,1,1,4245,1905,2
-4716,convert_element_type_1159,call_function,convert_element_type.default,backward,17,1,1,1,4246,1905,2
-4717,convert_element_type_1160,call_function,convert_element_type.default,backward,17,1,1,1,4246,1904,2
-4718,view_877,call_function,view.default,backward,17,1,1,1,4247,1904,2
-4719,view_as_complex_76,call_function,view_as_complex.default,backward,17,1,1,1,4248,1903,6
-4720,_conj_20,call_function,_conj.default,backward,17,1,1,1,4,1904,3
-4721,clone_86,call_function,clone.default,backward,17,1,1,1,5,1903,3
-4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8
-4723,view_878,call_function,view.default,backward,17,1,1,1,4247,1903,2
-4724,view_as_complex_77,call_function,view_as_complex.default,backward,17,1,1,1,4248,1902,6
-4725,_conj_21,call_function,_conj.default,backward,17,1,1,1,4,1903,3
-4726,clone_87,call_function,clone.default,backward,17,1,1,1,5,1902,3
-4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8
-4728,view_as_real_76,call_function,view_as_real.default,backward,17,1,1,1,4252,1901,6
-4729,view_879,call_function,view.default,backward,17,1,1,1,4253,1900,6
-4730,convert_element_type_1161,call_function,convert_element_type.default,backward,17,1,1,1,4254,1899,6
-4731,view_as_real_77,call_function,view_as_real.default,backward,17,1,1,1,4252,1900,6
-4732,view_880,call_function,view.default,backward,17,1,1,1,4253,1899,6
-4733,convert_element_type_1162,call_function,convert_element_type.default,backward,17,1,1,1,4254,1898,6
-4734,view_881,call_function,view.default,backward,17,1,1,1,4246,1898,2
-4735,view_882,call_function,view.default,backward,17,1,1,1,4255,1898,5
-4736,view_883,call_function,view.default,backward,17,1,1,1,4255,1897,5
-4737,alias_default_967,call_function,alias.default,backward,17,1,1,2,4247,1897,4
-4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5
-4739,permute_655,call_function,permute.default,backward,17,1,1,1,4,1893,3
-4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5
-4741,permute_656,call_function,permute.default,backward,17,1,1,1,4249,2,4
-4742,dtype_cast_352,call_function,dtype_cast.default,backward,17,1,1,1,4250,1,4
-4743,alias_default_1402,call_function,alias.default,backward,17,1,1,0,4251,0,3
-4744,alias_default_968,call_function,alias.default,backward,17,1,1,2,4256,1897,4
-4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5
-4746,permute_659,call_function,permute.default,backward,17,1,1,1,4,1893,3
-4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5
-4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10
-4749,permute_660,call_function,permute.default,backward,17,1,1,1,4258,2,4
-4750,dtype_cast_353,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4
-4751,alias_default_1401,call_function,alias.default,backward,17,1,1,0,4260,0,3
-4752,alias_default_969,call_function,alias.default,backward,17,1,1,2,4256,1896,4
-4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5
-4754,permute_663,call_function,permute.default,backward,17,1,1,1,4,1892,3
-4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5
-4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10
-4757,permute_664,call_function,permute.default,backward,17,1,1,1,4258,2,4
-4758,dtype_cast_354,call_function,dtype_cast.default,backward,17,1,1,1,4259,1,4
-4759,alias_default_1400,call_function,alias.default,backward,17,1,1,0,4260,0,3
-4760,convert_element_type_1175,call_function,convert_element_type.default,backward,17,1,1,1,4282,1889,8
-4761,convert_element_type_1176,call_function,convert_element_type.default,backward,17,1,1,1,1879,1889,4
-4762,convert_element_type_1177,call_function,convert_element_type.default,backward,17,1,1,1,3,1883,2
-4763,alias_default_970,call_function,alias.default,backward,17,1,1,2,4283,1888,4
-4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8
-4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8
-4766,alias_default_971,call_function,alias.default,backward,17,1,1,2,4286,1881,4
-4767,alias_default_972,call_function,alias.default,backward,17,1,1,3,1888,1887,4
-4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8
-4769,sum_45,call_function,sum.dim_IntList,backward,17,1,1,1,4291,1879,5
-4770,div_50,call_function,div.Tensor,backward,17,1,1,1,1889,1879,6
-4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8
-4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10
-4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8
-4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8
-4775,sum_46,call_function,sum.dim_IntList,backward,17,1,1,1,4288,3,5
-4776,convert_element_type_1178,call_function,convert_element_type.default,backward,17,1,1,1,4296,1875,6
-4777,convert_element_type_1179,call_function,convert_element_type.default,backward,17,1,1,1,4289,2,3
-4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10
-4779,dtype_cast_355,call_function,dtype_cast.default,backward,17,1,1,1,4290,1,3
-4780,alias_default_1407,call_function,alias.default,backward,17,1,1,0,4291,0,2
-4781,alias_default_973,call_function,alias.default,unknown,,1,1,3,4298,1873,4
-4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5
-4783,permute_667,call_function,permute.default,backward,16,1,1,1,4,1869,3
-4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5
-4785,permute_668,call_function,permute.default,backward,16,1,1,1,4300,2,4
-4786,dtype_cast_356,call_function,dtype_cast.default,backward,16,1,1,1,4301,1,4
-4787,alias_default_1396,call_function,alias.default,backward,16,1,1,0,4302,0,3
-4788,alias_default_974,call_function,alias.default,backward,16,1,1,2,4301,1867,4
-4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8
-4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8
-4791,alias_default_975,call_function,alias.default,backward,16,1,1,2,4303,1854,4
-4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5
-4793,permute_671,call_function,permute.default,backward,16,1,1,1,4,1850,3
-4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5
-4795,permute_672,call_function,permute.default,backward,16,1,1,1,4305,2,4
-4796,dtype_cast_357,call_function,dtype_cast.default,backward,16,1,1,1,4306,1,4
-4797,alias_default_1397,call_function,alias.default,backward,16,1,1,0,4307,0,3
-4798,convert_element_type_1188,call_function,convert_element_type.default,backward,16,1,1,1,4303,1858,6
-4799,convert_element_type_1189,call_function,convert_element_type.default,backward,16,1,1,1,1856,1868,4
-4800,alias_default_976,call_function,alias.default,backward,16,1,1,2,1857,1867,4
-4801,neg_39,call_function,neg.default,backward,16,1,1,1,1858,1866,8
-4802,exp_39,call_function,exp.default,backward,16,1,1,1,1859,1865,6
-4803,add_218,call_function,add.Tensor,backward,16,1,1,1,1860,1864,4
-4804,reciprocal_11,call_function,reciprocal.default,backward,16,1,1,1,1861,1863,4
-4805,mul_426,call_function,mul.Tensor,backward,16,1,1,1,1862,1862,6
-4806,alias_default_977,call_function,alias.default,backward,16,1,1,2,1863,1861,4
-4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8
-4808,sub_34,call_function,sub.Tensor,backward,16,1,1,1,1864,1859,4
-4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8
-4810,add_219,call_function,add.Tensor,backward,16,1,1,1,1866,1857,4
-4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8
-4812,convert_element_type_1190,call_function,convert_element_type.default,backward,16,1,1,1,4317,1855,6
-4813,alias_default_978,call_function,alias.default,backward,16,1,1,2,4318,1854,4
-4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5
-4815,permute_675,call_function,permute.default,backward,16,1,1,1,4,1850,3
-4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5
-4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10
-4818,permute_676,call_function,permute.default,backward,16,1,1,1,4320,2,4
-4819,dtype_cast_358,call_function,dtype_cast.default,backward,16,1,1,1,4321,1,4
-4820,alias_default_1395,call_function,alias.default,backward,16,1,1,0,4322,0,3
-4821,convert_element_type_1195,call_function,convert_element_type.default,backward,16,1,1,1,4326,1847,8
-4822,convert_element_type_1196,call_function,convert_element_type.default,backward,16,1,1,1,1836,1847,4
-4823,convert_element_type_1197,call_function,convert_element_type.default,backward,16,1,1,1,3,1841,2
-4824,alias_default_979,call_function,alias.default,backward,16,1,1,2,4327,1846,4
-4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8
-4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8
-4827,alias_default_980,call_function,alias.default,backward,16,1,1,2,4330,1839,4
-4828,alias_default_981,call_function,alias.default,backward,16,1,1,3,1845,1845,4
-4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8
-4830,sum_47,call_function,sum.dim_IntList,backward,16,1,1,1,4335,1837,5
-4831,div_51,call_function,div.Tensor,backward,16,1,1,1,1846,1837,6
-4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8
-4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10
-4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8
-4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8
-4836,sum_48,call_function,sum.dim_IntList,backward,16,1,1,1,4332,3,5
-4837,convert_element_type_1198,call_function,convert_element_type.default,backward,16,1,1,1,4340,1833,6
-4838,convert_element_type_1199,call_function,convert_element_type.default,backward,16,1,1,1,4333,2,3
-4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10
-4840,dtype_cast_359,call_function,dtype_cast.default,backward,16,1,1,1,4334,1,3
-4841,alias_default_1399,call_function,alias.default,backward,16,1,1,0,4335,0,2
-4842,alias_default_982,call_function,alias.default,unknown,,1,1,3,4342,1831,4
-4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5
-4844,permute_679,call_function,permute.default,backward,16,1,1,1,4,1827,3
-4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5
-4846,permute_680,call_function,permute.default,backward,16,1,1,1,4344,2,4
-4847,dtype_cast_360,call_function,dtype_cast.default,backward,16,1,1,1,4345,1,4
-4848,alias_default_1394,call_function,alias.default,backward,16,1,1,0,4346,0,3
-4849,view_898,call_function,view.default,backward,16,1,1,1,4345,1825,4
-4850,permute_681,call_function,permute.default,backward,16,1,1,1,4346,1824,4
-4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2
-4852,getitem_285,call_function,getitem,backward,16,1,1,1,4351,1796,2
-4853,getitem_286,call_function,getitem,backward,16,1,1,1,4351,1797,2
-4854,getitem_287,call_function,getitem,backward,16,1,1,1,4351,1790,2
-4855,permute_682,call_function,permute.default,backward,16,1,1,1,4352,1789,2
-4856,permute_683,call_function,permute.default,backward,16,1,1,1,4352,1796,2
-4857,permute_684,call_function,permute.default,backward,16,1,1,1,4352,1795,2
-4858,convert_element_type_1204,call_function,convert_element_type.default,backward,16,1,1,1,4353,1795,2
-4859,convert_element_type_1205,call_function,convert_element_type.default,backward,16,1,1,1,4353,1794,2
-4860,view_899,call_function,view.default,backward,16,1,1,1,4354,1794,2
-4861,view_as_complex_78,call_function,view_as_complex.default,backward,16,1,1,1,4355,1793,6
-4862,_conj_22,call_function,_conj.default,backward,16,1,1,1,4,1794,3
-4863,clone_94,call_function,clone.default,backward,16,1,1,1,5,1793,3
-4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8
-4865,view_900,call_function,view.default,backward,16,1,1,1,4354,1793,2
-4866,view_as_complex_79,call_function,view_as_complex.default,backward,16,1,1,1,4355,1792,6
-4867,_conj_23,call_function,_conj.default,backward,16,1,1,1,4,1793,3
-4868,clone_95,call_function,clone.default,backward,16,1,1,1,5,1792,3
-4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8
-4870,view_as_real_78,call_function,view_as_real.default,backward,16,1,1,1,4359,1791,6
-4871,view_901,call_function,view.default,backward,16,1,1,1,4360,1790,6
-4872,convert_element_type_1206,call_function,convert_element_type.default,backward,16,1,1,1,4361,1789,6
-4873,view_as_real_79,call_function,view_as_real.default,backward,16,1,1,1,4359,1790,6
-4874,view_902,call_function,view.default,backward,16,1,1,1,4360,1789,6
-4875,convert_element_type_1207,call_function,convert_element_type.default,backward,16,1,1,1,4361,1788,6
-4876,view_903,call_function,view.default,backward,16,1,1,1,4353,1788,2
-4877,view_904,call_function,view.default,backward,16,1,1,1,4362,1788,5
-4878,view_905,call_function,view.default,backward,16,1,1,1,4362,1787,5
-4879,alias_default_983,call_function,alias.default,backward,16,1,1,2,4354,1787,4
-4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5
-4881,permute_687,call_function,permute.default,backward,16,1,1,1,4,1783,3
-4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5
-4883,permute_688,call_function,permute.default,backward,16,1,1,1,4356,2,4
-4884,dtype_cast_361,call_function,dtype_cast.default,backward,16,1,1,1,4357,1,4
-4885,alias_default_1393,call_function,alias.default,backward,16,1,1,0,4358,0,3
-4886,alias_default_984,call_function,alias.default,backward,16,1,1,2,4363,1787,4
-4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5
-4888,permute_691,call_function,permute.default,backward,16,1,1,1,4,1783,3
-4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5
-4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10
-4891,permute_692,call_function,permute.default,backward,16,1,1,1,4365,2,4
-4892,dtype_cast_362,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4
-4893,alias_default_1392,call_function,alias.default,backward,16,1,1,0,4367,0,3
-4894,alias_default_985,call_function,alias.default,backward,16,1,1,2,4363,1786,4
-4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5
-4896,permute_695,call_function,permute.default,backward,16,1,1,1,4,1782,3
-4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5
-4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10
-4899,permute_696,call_function,permute.default,backward,16,1,1,1,4365,2,4
-4900,dtype_cast_363,call_function,dtype_cast.default,backward,16,1,1,1,4366,1,4
-4901,alias_default_1391,call_function,alias.default,backward,16,1,1,0,4367,0,3
-4902,convert_element_type_1220,call_function,convert_element_type.default,backward,16,1,1,1,4389,1779,8
-4903,convert_element_type_1221,call_function,convert_element_type.default,backward,16,1,1,1,1769,1779,4
-4904,convert_element_type_1222,call_function,convert_element_type.default,backward,16,1,1,1,3,1773,2
-4905,alias_default_986,call_function,alias.default,backward,16,1,1,2,4390,1778,4
-4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8
-4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8
-4908,alias_default_987,call_function,alias.default,backward,16,1,1,2,4393,1771,4
-4909,alias_default_988,call_function,alias.default,backward,16,1,1,3,1778,1777,4
-4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8
-4911,sum_49,call_function,sum.dim_IntList,backward,16,1,1,1,4398,1769,5
-4912,div_52,call_function,div.Tensor,backward,16,1,1,1,1779,1769,6
-4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8
-4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10
-4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8
-4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8
-4917,sum_50,call_function,sum.dim_IntList,backward,16,1,1,1,4395,3,5
-4918,convert_element_type_1223,call_function,convert_element_type.default,backward,16,1,1,1,4403,1765,6
-4919,convert_element_type_1224,call_function,convert_element_type.default,backward,16,1,1,1,4396,2,3
-4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10
-4921,dtype_cast_364,call_function,dtype_cast.default,backward,16,1,1,1,4397,1,3
-4922,alias_default_1398,call_function,alias.default,backward,16,1,1,0,4398,0,2
-4923,alias_default_989,call_function,alias.default,unknown,,1,1,3,4405,1763,4
-4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5
-4925,permute_699,call_function,permute.default,backward,15,1,1,1,4,1759,3
-4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5
-4927,permute_700,call_function,permute.default,backward,15,1,1,1,4407,2,4
-4928,dtype_cast_365,call_function,dtype_cast.default,backward,15,1,1,1,4408,1,4
-4929,alias_default_1387,call_function,alias.default,backward,15,1,1,0,4409,0,3
-4930,alias_default_990,call_function,alias.default,backward,15,1,1,2,4408,1757,4
-4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8
-4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8
-4933,alias_default_991,call_function,alias.default,backward,15,1,1,2,4410,1744,4
-4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5
-4935,permute_703,call_function,permute.default,backward,15,1,1,1,4,1740,3
-4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5
-4937,permute_704,call_function,permute.default,backward,15,1,1,1,4412,2,4
-4938,dtype_cast_366,call_function,dtype_cast.default,backward,15,1,1,1,4413,1,4
-4939,alias_default_1388,call_function,alias.default,backward,15,1,1,0,4414,0,3
-4940,convert_element_type_1233,call_function,convert_element_type.default,backward,15,1,1,1,4410,1748,6
-4941,convert_element_type_1234,call_function,convert_element_type.default,backward,15,1,1,1,1746,1758,4
-4942,alias_default_992,call_function,alias.default,backward,15,1,1,2,1747,1757,4
-4943,neg_40,call_function,neg.default,backward,15,1,1,1,1748,1756,8
-4944,exp_40,call_function,exp.default,backward,15,1,1,1,1749,1755,6
-4945,add_225,call_function,add.Tensor,backward,15,1,1,1,1750,1754,4
-4946,reciprocal_12,call_function,reciprocal.default,backward,15,1,1,1,1751,1753,4
-4947,mul_446,call_function,mul.Tensor,backward,15,1,1,1,1752,1752,6
-4948,alias_default_993,call_function,alias.default,backward,15,1,1,2,1753,1751,4
-4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8
-4950,sub_37,call_function,sub.Tensor,backward,15,1,1,1,1754,1749,4
-4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8
-4952,add_226,call_function,add.Tensor,backward,15,1,1,1,1756,1747,4
-4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8
-4954,convert_element_type_1235,call_function,convert_element_type.default,backward,15,1,1,1,4424,1745,6
-4955,alias_default_994,call_function,alias.default,backward,15,1,1,2,4425,1744,4
-4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5
-4957,permute_707,call_function,permute.default,backward,15,1,1,1,4,1740,3
-4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5
-4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10
-4960,permute_708,call_function,permute.default,backward,15,1,1,1,4427,2,4
-4961,dtype_cast_367,call_function,dtype_cast.default,backward,15,1,1,1,4428,1,4
-4962,alias_default_1386,call_function,alias.default,backward,15,1,1,0,4429,0,3
-4963,convert_element_type_1240,call_function,convert_element_type.default,backward,15,1,1,1,4433,1737,8
-4964,convert_element_type_1241,call_function,convert_element_type.default,backward,15,1,1,1,1726,1737,4
-4965,convert_element_type_1242,call_function,convert_element_type.default,backward,15,1,1,1,3,1731,2
-4966,alias_default_995,call_function,alias.default,backward,15,1,1,2,4434,1736,4
-4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8
-4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8
-4969,alias_default_996,call_function,alias.default,backward,15,1,1,2,4437,1729,4
-4970,alias_default_997,call_function,alias.default,backward,15,1,1,3,1735,1735,4
-4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8
-4972,sum_51,call_function,sum.dim_IntList,backward,15,1,1,1,4442,1727,5
-4973,div_53,call_function,div.Tensor,backward,15,1,1,1,1736,1727,6
-4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8
-4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10
-4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8
-4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8
-4978,sum_52,call_function,sum.dim_IntList,backward,15,1,1,1,4439,3,5
-4979,convert_element_type_1243,call_function,convert_element_type.default,backward,15,1,1,1,4447,1723,6
-4980,convert_element_type_1244,call_function,convert_element_type.default,backward,15,1,1,1,4440,2,3
-4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10
-4982,dtype_cast_368,call_function,dtype_cast.default,backward,15,1,1,1,4441,1,3
-4983,alias_default_1390,call_function,alias.default,backward,15,1,1,0,4442,0,2
-4984,alias_default_998,call_function,alias.default,unknown,,1,1,3,4449,1721,4
-4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5
-4986,permute_711,call_function,permute.default,backward,15,1,1,1,4,1717,3
-4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5
-4988,permute_712,call_function,permute.default,backward,15,1,1,1,4451,2,4
-4989,dtype_cast_369,call_function,dtype_cast.default,backward,15,1,1,1,4452,1,4
-4990,alias_default_1385,call_function,alias.default,backward,15,1,1,0,4453,0,3
-4991,view_920,call_function,view.default,backward,15,1,1,1,4452,1715,4
-4992,permute_713,call_function,permute.default,backward,15,1,1,1,4453,1714,4
-4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2
-4994,getitem_288,call_function,getitem,backward,15,1,1,1,4458,1686,2
-4995,getitem_289,call_function,getitem,backward,15,1,1,1,4458,1687,2
-4996,getitem_290,call_function,getitem,backward,15,1,1,1,4458,1680,2
-4997,permute_714,call_function,permute.default,backward,15,1,1,1,4459,1679,2
-4998,permute_715,call_function,permute.default,backward,15,1,1,1,4459,1686,2
-4999,permute_716,call_function,permute.default,backward,15,1,1,1,4459,1685,2
-5000,convert_element_type_1249,call_function,convert_element_type.default,backward,15,1,1,1,4460,1685,2
-5001,convert_element_type_1250,call_function,convert_element_type.default,backward,15,1,1,1,4460,1684,2
-5002,view_921,call_function,view.default,backward,15,1,1,1,4461,1684,2
-5003,view_as_complex_80,call_function,view_as_complex.default,backward,15,1,1,1,4462,1683,6
-5004,_conj_24,call_function,_conj.default,backward,15,1,1,1,4,1684,3
-5005,clone_102,call_function,clone.default,backward,15,1,1,1,5,1683,3
-5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8
-5007,view_922,call_function,view.default,backward,15,1,1,1,4461,1683,2
-5008,view_as_complex_81,call_function,view_as_complex.default,backward,15,1,1,1,4462,1682,6
-5009,_conj_25,call_function,_conj.default,backward,15,1,1,1,4,1683,3
-5010,clone_103,call_function,clone.default,backward,15,1,1,1,5,1682,3
-5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8
-5012,view_as_real_80,call_function,view_as_real.default,backward,15,1,1,1,4466,1681,6
-5013,view_923,call_function,view.default,backward,15,1,1,1,4467,1680,6
-5014,convert_element_type_1251,call_function,convert_element_type.default,backward,15,1,1,1,4468,1679,6
-5015,view_as_real_81,call_function,view_as_real.default,backward,15,1,1,1,4466,1680,6
-5016,view_924,call_function,view.default,backward,15,1,1,1,4467,1679,6
-5017,convert_element_type_1252,call_function,convert_element_type.default,backward,15,1,1,1,4468,1678,6
-5018,view_925,call_function,view.default,backward,15,1,1,1,4460,1678,2
-5019,view_926,call_function,view.default,backward,15,1,1,1,4469,1678,5
-5020,view_927,call_function,view.default,backward,15,1,1,1,4469,1677,5
-5021,alias_default_999,call_function,alias.default,backward,15,1,1,2,4461,1677,4
-5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5
-5023,permute_719,call_function,permute.default,backward,15,1,1,1,4,1673,3
-5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5
-5025,permute_720,call_function,permute.default,backward,15,1,1,1,4463,2,4
-5026,dtype_cast_370,call_function,dtype_cast.default,backward,15,1,1,1,4464,1,4
-5027,alias_default_1384,call_function,alias.default,backward,15,1,1,0,4465,0,3
-5028,alias_default_1000,call_function,alias.default,backward,15,1,1,2,4470,1677,4
-5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5
-5030,permute_723,call_function,permute.default,backward,15,1,1,1,4,1673,3
-5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5
-5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10
-5033,permute_724,call_function,permute.default,backward,15,1,1,1,4472,2,4
-5034,dtype_cast_371,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4
-5035,alias_default_1383,call_function,alias.default,backward,15,1,1,0,4474,0,3
-5036,alias_default_1001,call_function,alias.default,backward,15,1,1,2,4470,1676,4
-5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5
-5038,permute_727,call_function,permute.default,backward,15,1,1,1,4,1672,3
-5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5
-5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10
-5041,permute_728,call_function,permute.default,backward,15,1,1,1,4472,2,4
-5042,dtype_cast_372,call_function,dtype_cast.default,backward,15,1,1,1,4473,1,4
-5043,alias_default_1382,call_function,alias.default,backward,15,1,1,0,4474,0,3
-5044,convert_element_type_1265,call_function,convert_element_type.default,backward,15,1,1,1,4496,1669,8
-5045,convert_element_type_1266,call_function,convert_element_type.default,backward,15,1,1,1,1659,1669,4
-5046,convert_element_type_1267,call_function,convert_element_type.default,backward,15,1,1,1,3,1663,2
-5047,alias_default_1002,call_function,alias.default,backward,15,1,1,2,4497,1668,4
-5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8
-5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8
-5050,alias_default_1003,call_function,alias.default,backward,15,1,1,2,4500,1661,4
-5051,alias_default_1004,call_function,alias.default,backward,15,1,1,3,1668,1667,4
-5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8
-5053,sum_53,call_function,sum.dim_IntList,backward,15,1,1,1,4505,1659,5
-5054,div_54,call_function,div.Tensor,backward,15,1,1,1,1669,1659,6
-5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8
-5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10
-5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8
-5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8
-5059,sum_54,call_function,sum.dim_IntList,backward,15,1,1,1,4502,3,5
-5060,convert_element_type_1268,call_function,convert_element_type.default,backward,15,1,1,1,4510,1655,6
-5061,convert_element_type_1269,call_function,convert_element_type.default,backward,15,1,1,1,4503,2,3
-5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10
-5063,dtype_cast_373,call_function,dtype_cast.default,backward,15,1,1,1,4504,1,3
-5064,alias_default_1389,call_function,alias.default,backward,15,1,1,0,4505,0,2
-5065,alias_default_1005,call_function,alias.default,unknown,,1,1,3,4512,1653,4
-5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5
-5067,permute_731,call_function,permute.default,backward,14,1,1,1,4,1649,3
-5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5
-5069,permute_732,call_function,permute.default,backward,14,1,1,1,4514,2,4
-5070,dtype_cast_374,call_function,dtype_cast.default,backward,14,1,1,1,4515,1,4
-5071,alias_default_1378,call_function,alias.default,backward,14,1,1,0,4516,0,3
-5072,alias_default_1006,call_function,alias.default,backward,14,1,1,2,4515,1647,4
-5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8
-5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8
-5075,alias_default_1007,call_function,alias.default,backward,14,1,1,2,4517,1634,4
-5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5
-5077,permute_735,call_function,permute.default,backward,14,1,1,1,4,1630,3
-5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5
-5079,permute_736,call_function,permute.default,backward,14,1,1,1,4519,2,4
-5080,dtype_cast_375,call_function,dtype_cast.default,backward,14,1,1,1,4520,1,4
-5081,alias_default_1379,call_function,alias.default,backward,14,1,1,0,4521,0,3
-5082,convert_element_type_1278,call_function,convert_element_type.default,backward,14,1,1,1,4517,1638,6
-5083,convert_element_type_1279,call_function,convert_element_type.default,backward,14,1,1,1,1636,1648,4
-5084,alias_default_1008,call_function,alias.default,backward,14,1,1,2,1637,1647,4
-5085,neg_41,call_function,neg.default,backward,14,1,1,1,1638,1646,8
-5086,exp_41,call_function,exp.default,backward,14,1,1,1,1639,1645,6
-5087,add_232,call_function,add.Tensor,backward,14,1,1,1,1640,1644,4
-5088,reciprocal_13,call_function,reciprocal.default,backward,14,1,1,1,1641,1643,4
-5089,mul_466,call_function,mul.Tensor,backward,14,1,1,1,1642,1642,6
-5090,alias_default_1009,call_function,alias.default,backward,14,1,1,2,1643,1641,4
-5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8
-5092,sub_40,call_function,sub.Tensor,backward,14,1,1,1,1644,1639,4
-5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8
-5094,add_233,call_function,add.Tensor,backward,14,1,1,1,1646,1637,4
-5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8
-5096,convert_element_type_1280,call_function,convert_element_type.default,backward,14,1,1,1,4531,1635,6
-5097,alias_default_1010,call_function,alias.default,backward,14,1,1,2,4532,1634,4
-5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5
-5099,permute_739,call_function,permute.default,backward,14,1,1,1,4,1630,3
-5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5
-5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10
-5102,permute_740,call_function,permute.default,backward,14,1,1,1,4534,2,4
-5103,dtype_cast_376,call_function,dtype_cast.default,backward,14,1,1,1,4535,1,4
-5104,alias_default_1377,call_function,alias.default,backward,14,1,1,0,4536,0,3
-5105,convert_element_type_1285,call_function,convert_element_type.default,backward,14,1,1,1,4540,1627,8
-5106,convert_element_type_1286,call_function,convert_element_type.default,backward,14,1,1,1,1616,1627,4
-5107,convert_element_type_1287,call_function,convert_element_type.default,backward,14,1,1,1,3,1621,2
-5108,alias_default_1011,call_function,alias.default,backward,14,1,1,2,4541,1626,4
-5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8
-5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8
-5111,alias_default_1012,call_function,alias.default,backward,14,1,1,2,4544,1619,4
-5112,alias_default_1013,call_function,alias.default,backward,14,1,1,3,1625,1625,4
-5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8
-5114,sum_55,call_function,sum.dim_IntList,backward,14,1,1,1,4549,1617,5
-5115,div_55,call_function,div.Tensor,backward,14,1,1,1,1626,1617,6
-5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8
-5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10
-5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8
-5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8
-5120,sum_56,call_function,sum.dim_IntList,backward,14,1,1,1,4546,3,5
-5121,convert_element_type_1288,call_function,convert_element_type.default,backward,14,1,1,1,4554,1613,6
-5122,convert_element_type_1289,call_function,convert_element_type.default,backward,14,1,1,1,4547,2,3
-5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10
-5124,dtype_cast_377,call_function,dtype_cast.default,backward,14,1,1,1,4548,1,3
-5125,alias_default_1381,call_function,alias.default,backward,14,1,1,0,4549,0,2
-5126,alias_default_1014,call_function,alias.default,unknown,,1,1,3,4556,1611,4
-5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5
-5128,permute_743,call_function,permute.default,backward,14,1,1,1,4,1607,3
-5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5
-5130,permute_744,call_function,permute.default,backward,14,1,1,1,4558,2,4
-5131,dtype_cast_378,call_function,dtype_cast.default,backward,14,1,1,1,4559,1,4
-5132,alias_default_1376,call_function,alias.default,backward,14,1,1,0,4560,0,3
-5133,view_942,call_function,view.default,backward,14,1,1,1,4559,1605,4
-5134,permute_745,call_function,permute.default,backward,14,1,1,1,4560,1604,4
-5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2
-5136,getitem_291,call_function,getitem,backward,14,1,1,1,4565,1576,2
-5137,getitem_292,call_function,getitem,backward,14,1,1,1,4565,1577,2
-5138,getitem_293,call_function,getitem,backward,14,1,1,1,4565,1570,2
-5139,permute_746,call_function,permute.default,backward,14,1,1,1,4566,1569,2
-5140,permute_747,call_function,permute.default,backward,14,1,1,1,4566,1576,2
-5141,permute_748,call_function,permute.default,backward,14,1,1,1,4566,1575,2
-5142,convert_element_type_1294,call_function,convert_element_type.default,backward,14,1,1,1,4567,1575,2
-5143,convert_element_type_1295,call_function,convert_element_type.default,backward,14,1,1,1,4567,1574,2
-5144,view_943,call_function,view.default,backward,14,1,1,1,4568,1574,2
-5145,view_as_complex_82,call_function,view_as_complex.default,backward,14,1,1,1,4569,1573,6
-5146,_conj_26,call_function,_conj.default,backward,14,1,1,1,4,1574,3
-5147,clone_110,call_function,clone.default,backward,14,1,1,1,5,1573,3
-5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8
-5149,view_944,call_function,view.default,backward,14,1,1,1,4568,1573,2
-5150,view_as_complex_83,call_function,view_as_complex.default,backward,14,1,1,1,4569,1572,6
-5151,_conj_27,call_function,_conj.default,backward,14,1,1,1,4,1573,3
-5152,clone_111,call_function,clone.default,backward,14,1,1,1,5,1572,3
-5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8
-5154,view_as_real_82,call_function,view_as_real.default,backward,14,1,1,1,4573,1571,6
-5155,view_945,call_function,view.default,backward,14,1,1,1,4574,1570,6
-5156,convert_element_type_1296,call_function,convert_element_type.default,backward,14,1,1,1,4575,1569,6
-5157,view_as_real_83,call_function,view_as_real.default,backward,14,1,1,1,4573,1570,6
-5158,view_946,call_function,view.default,backward,14,1,1,1,4574,1569,6
-5159,convert_element_type_1297,call_function,convert_element_type.default,backward,14,1,1,1,4575,1568,6
-5160,view_947,call_function,view.default,backward,14,1,1,1,4567,1568,2
-5161,view_948,call_function,view.default,backward,14,1,1,1,4576,1568,5
-5162,view_949,call_function,view.default,backward,14,1,1,1,4576,1567,5
-5163,alias_default_1015,call_function,alias.default,backward,14,1,1,2,4568,1567,4
-5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5
-5165,permute_751,call_function,permute.default,backward,14,1,1,1,4,1563,3
-5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5
-5167,permute_752,call_function,permute.default,backward,14,1,1,1,4570,2,4
-5168,dtype_cast_379,call_function,dtype_cast.default,backward,14,1,1,1,4571,1,4
-5169,alias_default_1375,call_function,alias.default,backward,14,1,1,0,4572,0,3
-5170,alias_default_1016,call_function,alias.default,backward,14,1,1,2,4577,1567,4
-5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5
-5172,permute_755,call_function,permute.default,backward,14,1,1,1,4,1563,3
-5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5
-5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10
-5175,permute_756,call_function,permute.default,backward,14,1,1,1,4579,2,4
-5176,dtype_cast_380,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4
-5177,alias_default_1374,call_function,alias.default,backward,14,1,1,0,4581,0,3
-5178,alias_default_1017,call_function,alias.default,backward,14,1,1,2,4577,1566,4
-5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5
-5180,permute_759,call_function,permute.default,backward,14,1,1,1,4,1562,3
-5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5
-5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10
-5183,permute_760,call_function,permute.default,backward,14,1,1,1,4579,2,4
-5184,dtype_cast_381,call_function,dtype_cast.default,backward,14,1,1,1,4580,1,4
-5185,alias_default_1373,call_function,alias.default,backward,14,1,1,0,4581,0,3
-5186,convert_element_type_1310,call_function,convert_element_type.default,backward,14,1,1,1,4603,1559,8
-5187,convert_element_type_1311,call_function,convert_element_type.default,backward,14,1,1,1,1549,1559,4
-5188,convert_element_type_1312,call_function,convert_element_type.default,backward,14,1,1,1,3,1553,2
-5189,alias_default_1018,call_function,alias.default,backward,14,1,1,2,4604,1558,4
-5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8
-5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8
-5192,alias_default_1019,call_function,alias.default,backward,14,1,1,2,4607,1551,4
-5193,alias_default_1020,call_function,alias.default,backward,14,1,1,3,1558,1557,4
-5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8
-5195,sum_57,call_function,sum.dim_IntList,backward,14,1,1,1,4612,1549,5
-5196,div_56,call_function,div.Tensor,backward,14,1,1,1,1559,1549,6
-5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8
-5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10
-5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8
-5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8
-5201,sum_58,call_function,sum.dim_IntList,backward,14,1,1,1,4609,3,5
-5202,convert_element_type_1313,call_function,convert_element_type.default,backward,14,1,1,1,4617,1545,6
-5203,convert_element_type_1314,call_function,convert_element_type.default,backward,14,1,1,1,4610,2,3
-5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10
-5205,dtype_cast_382,call_function,dtype_cast.default,backward,14,1,1,1,4611,1,3
-5206,alias_default_1380,call_function,alias.default,backward,14,1,1,0,4612,0,2
-5207,alias_default_1021,call_function,alias.default,unknown,,1,1,3,4619,1543,4
-5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5
-5209,permute_763,call_function,permute.default,backward,13,1,1,1,4,1539,3
-5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5
-5211,permute_764,call_function,permute.default,backward,13,1,1,1,4621,2,4
-5212,dtype_cast_383,call_function,dtype_cast.default,backward,13,1,1,1,4622,1,4
-5213,alias_default_1369,call_function,alias.default,backward,13,1,1,0,4623,0,3
-5214,alias_default_1022,call_function,alias.default,backward,13,1,1,2,4622,1537,4
-5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8
-5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8
-5217,alias_default_1023,call_function,alias.default,backward,13,1,1,2,4624,1524,4
-5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5
-5219,permute_767,call_function,permute.default,backward,13,1,1,1,4,1520,3
-5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5
-5221,permute_768,call_function,permute.default,backward,13,1,1,1,4626,2,4
-5222,dtype_cast_384,call_function,dtype_cast.default,backward,13,1,1,1,4627,1,4
-5223,alias_default_1370,call_function,alias.default,backward,13,1,1,0,4628,0,3
-5224,convert_element_type_1323,call_function,convert_element_type.default,backward,13,1,1,1,4624,1528,6
-5225,convert_element_type_1324,call_function,convert_element_type.default,backward,13,1,1,1,1526,1538,4
-5226,alias_default_1024,call_function,alias.default,backward,13,1,1,2,1527,1537,4
-5227,neg_42,call_function,neg.default,backward,13,1,1,1,1528,1536,8
-5228,exp_42,call_function,exp.default,backward,13,1,1,1,1529,1535,6
-5229,add_239,call_function,add.Tensor,backward,13,1,1,1,1530,1534,4
-5230,reciprocal_14,call_function,reciprocal.default,backward,13,1,1,1,1531,1533,4
-5231,mul_486,call_function,mul.Tensor,backward,13,1,1,1,1532,1532,6
-5232,alias_default_1025,call_function,alias.default,backward,13,1,1,2,1533,1531,4
-5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8
-5234,sub_43,call_function,sub.Tensor,backward,13,1,1,1,1534,1529,4
-5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8
-5236,add_240,call_function,add.Tensor,backward,13,1,1,1,1536,1527,4
-5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8
-5238,convert_element_type_1325,call_function,convert_element_type.default,backward,13,1,1,1,4638,1525,6
-5239,alias_default_1026,call_function,alias.default,backward,13,1,1,2,4639,1524,4
-5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5
-5241,permute_771,call_function,permute.default,backward,13,1,1,1,4,1520,3
-5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5
-5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10
-5244,permute_772,call_function,permute.default,backward,13,1,1,1,4641,2,4
-5245,dtype_cast_385,call_function,dtype_cast.default,backward,13,1,1,1,4642,1,4
-5246,alias_default_1368,call_function,alias.default,backward,13,1,1,0,4643,0,3
-5247,convert_element_type_1330,call_function,convert_element_type.default,backward,13,1,1,1,4647,1517,8
-5248,convert_element_type_1331,call_function,convert_element_type.default,backward,13,1,1,1,1506,1517,4
-5249,convert_element_type_1332,call_function,convert_element_type.default,backward,13,1,1,1,3,1511,2
-5250,alias_default_1027,call_function,alias.default,backward,13,1,1,2,4648,1516,4
-5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8
-5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8
-5253,alias_default_1028,call_function,alias.default,backward,13,1,1,2,4651,1509,4
-5254,alias_default_1029,call_function,alias.default,backward,13,1,1,3,1515,1515,4
-5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8
-5256,sum_59,call_function,sum.dim_IntList,backward,13,1,1,1,4656,1507,5
-5257,div_57,call_function,div.Tensor,backward,13,1,1,1,1516,1507,6
-5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8
-5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10
-5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8
-5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8
-5262,sum_60,call_function,sum.dim_IntList,backward,13,1,1,1,4653,3,5
-5263,convert_element_type_1333,call_function,convert_element_type.default,backward,13,1,1,1,4661,1503,6
-5264,convert_element_type_1334,call_function,convert_element_type.default,backward,13,1,1,1,4654,2,3
-5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10
-5266,dtype_cast_386,call_function,dtype_cast.default,backward,13,1,1,1,4655,1,3
-5267,alias_default_1372,call_function,alias.default,backward,13,1,1,0,4656,0,2
-5268,alias_default_1030,call_function,alias.default,unknown,,1,1,3,4663,1501,4
-5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5
-5270,permute_775,call_function,permute.default,backward,13,1,1,1,4,1497,3
-5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5
-5272,permute_776,call_function,permute.default,backward,13,1,1,1,4665,2,4
-5273,dtype_cast_387,call_function,dtype_cast.default,backward,13,1,1,1,4666,1,4
-5274,alias_default_1367,call_function,alias.default,backward,13,1,1,0,4667,0,3
-5275,view_964,call_function,view.default,backward,13,1,1,1,4666,1495,4
-5276,permute_777,call_function,permute.default,backward,13,1,1,1,4667,1494,4
-5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2
-5278,getitem_294,call_function,getitem,backward,13,1,1,1,4672,1466,2
-5279,getitem_295,call_function,getitem,backward,13,1,1,1,4672,1467,2
-5280,getitem_296,call_function,getitem,backward,13,1,1,1,4672,1460,2
-5281,permute_778,call_function,permute.default,backward,13,1,1,1,4673,1459,2
-5282,permute_779,call_function,permute.default,backward,13,1,1,1,4673,1466,2
-5283,permute_780,call_function,permute.default,backward,13,1,1,1,4673,1465,2
-5284,convert_element_type_1339,call_function,convert_element_type.default,backward,13,1,1,1,4674,1465,2
-5285,convert_element_type_1340,call_function,convert_element_type.default,backward,13,1,1,1,4674,1464,2
-5286,view_965,call_function,view.default,backward,13,1,1,1,4675,1464,2
-5287,view_as_complex_84,call_function,view_as_complex.default,backward,13,1,1,1,4676,1463,6
-5288,_conj_28,call_function,_conj.default,backward,13,1,1,1,4,1464,3
-5289,clone_118,call_function,clone.default,backward,13,1,1,1,5,1463,3
-5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8
-5291,view_966,call_function,view.default,backward,13,1,1,1,4675,1463,2
-5292,view_as_complex_85,call_function,view_as_complex.default,backward,13,1,1,1,4676,1462,6
-5293,_conj_29,call_function,_conj.default,backward,13,1,1,1,4,1463,3
-5294,clone_119,call_function,clone.default,backward,13,1,1,1,5,1462,3
-5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8
-5296,view_as_real_84,call_function,view_as_real.default,backward,13,1,1,1,4680,1461,6
-5297,view_967,call_function,view.default,backward,13,1,1,1,4681,1460,6
-5298,convert_element_type_1341,call_function,convert_element_type.default,backward,13,1,1,1,4682,1459,6
-5299,view_as_real_85,call_function,view_as_real.default,backward,13,1,1,1,4680,1460,6
-5300,view_968,call_function,view.default,backward,13,1,1,1,4681,1459,6
-5301,convert_element_type_1342,call_function,convert_element_type.default,backward,13,1,1,1,4682,1458,6
-5302,view_969,call_function,view.default,backward,13,1,1,1,4674,1458,2
-5303,view_970,call_function,view.default,backward,13,1,1,1,4683,1458,5
-5304,view_971,call_function,view.default,backward,13,1,1,1,4683,1457,5
-5305,alias_default_1031,call_function,alias.default,backward,13,1,1,2,4675,1457,4
-5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5
-5307,permute_783,call_function,permute.default,backward,13,1,1,1,4,1453,3
-5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5
-5309,permute_784,call_function,permute.default,backward,13,1,1,1,4677,2,4
-5310,dtype_cast_388,call_function,dtype_cast.default,backward,13,1,1,1,4678,1,4
-5311,alias_default_1366,call_function,alias.default,backward,13,1,1,0,4679,0,3
-5312,alias_default_1032,call_function,alias.default,backward,13,1,1,2,4684,1457,4
-5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5
-5314,permute_787,call_function,permute.default,backward,13,1,1,1,4,1453,3
-5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5
-5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10
-5317,permute_788,call_function,permute.default,backward,13,1,1,1,4686,2,4
-5318,dtype_cast_389,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4
-5319,alias_default_1365,call_function,alias.default,backward,13,1,1,0,4688,0,3
-5320,alias_default_1033,call_function,alias.default,backward,13,1,1,2,4684,1456,4
-5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5
-5322,permute_791,call_function,permute.default,backward,13,1,1,1,4,1452,3
-5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5
-5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10
-5325,permute_792,call_function,permute.default,backward,13,1,1,1,4686,2,4
-5326,dtype_cast_390,call_function,dtype_cast.default,backward,13,1,1,1,4687,1,4
-5327,alias_default_1364,call_function,alias.default,backward,13,1,1,0,4688,0,3
-5328,convert_element_type_1355,call_function,convert_element_type.default,backward,13,1,1,1,4710,1449,8
-5329,convert_element_type_1356,call_function,convert_element_type.default,backward,13,1,1,1,1439,1449,4
-5330,convert_element_type_1357,call_function,convert_element_type.default,backward,13,1,1,1,3,1443,2
-5331,alias_default_1034,call_function,alias.default,backward,13,1,1,2,4711,1448,4
-5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8
-5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8
-5334,alias_default_1035,call_function,alias.default,backward,13,1,1,2,4714,1441,4
-5335,alias_default_1036,call_function,alias.default,backward,13,1,1,3,1448,1447,4
-5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8
-5337,sum_61,call_function,sum.dim_IntList,backward,13,1,1,1,4719,1439,5
-5338,div_58,call_function,div.Tensor,backward,13,1,1,1,1449,1439,6
-5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8
-5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10
-5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8
-5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8
-5343,sum_62,call_function,sum.dim_IntList,backward,13,1,1,1,4716,3,5
-5344,convert_element_type_1358,call_function,convert_element_type.default,backward,13,1,1,1,4724,1435,6
-5345,convert_element_type_1359,call_function,convert_element_type.default,backward,13,1,1,1,4717,2,3
-5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10
-5347,dtype_cast_391,call_function,dtype_cast.default,backward,13,1,1,1,4718,1,3
-5348,alias_default_1371,call_function,alias.default,backward,13,1,1,0,4719,0,2
-5349,alias_default_1037,call_function,alias.default,unknown,,1,1,3,4726,1433,4
-5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5
-5351,permute_795,call_function,permute.default,backward,12,1,1,1,4,1429,3
-5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5
-5353,permute_796,call_function,permute.default,backward,12,1,1,1,4728,2,4
-5354,dtype_cast_392,call_function,dtype_cast.default,backward,12,1,1,1,4729,1,4
-5355,alias_default_1360,call_function,alias.default,backward,12,1,1,0,4730,0,3
-5356,alias_default_1038,call_function,alias.default,backward,12,1,1,2,4729,1427,4
-5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8
-5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8
-5359,alias_default_1039,call_function,alias.default,backward,12,1,1,2,4731,1414,4
-5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5
-5361,permute_799,call_function,permute.default,backward,12,1,1,1,4,1410,3
-5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5
-5363,permute_800,call_function,permute.default,backward,12,1,1,1,4733,2,4
-5364,dtype_cast_393,call_function,dtype_cast.default,backward,12,1,1,1,4734,1,4
-5365,alias_default_1361,call_function,alias.default,backward,12,1,1,0,4735,0,3
-5366,convert_element_type_1368,call_function,convert_element_type.default,backward,12,1,1,1,4731,1418,6
-5367,convert_element_type_1369,call_function,convert_element_type.default,backward,12,1,1,1,1416,1428,4
-5368,alias_default_1040,call_function,alias.default,backward,12,1,1,2,1417,1427,4
-5369,neg_43,call_function,neg.default,backward,12,1,1,1,1418,1426,8
-5370,exp_43,call_function,exp.default,backward,12,1,1,1,1419,1425,6
-5371,add_246,call_function,add.Tensor,backward,12,1,1,1,1420,1424,4
-5372,reciprocal_15,call_function,reciprocal.default,backward,12,1,1,1,1421,1423,4
-5373,mul_506,call_function,mul.Tensor,backward,12,1,1,1,1422,1422,6
-5374,alias_default_1041,call_function,alias.default,backward,12,1,1,2,1423,1421,4
-5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8
-5376,sub_46,call_function,sub.Tensor,backward,12,1,1,1,1424,1419,4
-5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8
-5378,add_247,call_function,add.Tensor,backward,12,1,1,1,1426,1417,4
-5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8
-5380,convert_element_type_1370,call_function,convert_element_type.default,backward,12,1,1,1,4745,1415,6
-5381,alias_default_1042,call_function,alias.default,backward,12,1,1,2,4746,1414,4
-5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5
-5383,permute_803,call_function,permute.default,backward,12,1,1,1,4,1410,3
-5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5
-5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10
-5386,permute_804,call_function,permute.default,backward,12,1,1,1,4748,2,4
-5387,dtype_cast_394,call_function,dtype_cast.default,backward,12,1,1,1,4749,1,4
-5388,alias_default_1359,call_function,alias.default,backward,12,1,1,0,4750,0,3
-5389,convert_element_type_1375,call_function,convert_element_type.default,backward,12,1,1,1,4754,1407,8
-5390,convert_element_type_1376,call_function,convert_element_type.default,backward,12,1,1,1,1396,1407,4
-5391,convert_element_type_1377,call_function,convert_element_type.default,backward,12,1,1,1,3,1401,2
-5392,alias_default_1043,call_function,alias.default,backward,12,1,1,2,4755,1406,4
-5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8
-5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8
-5395,alias_default_1044,call_function,alias.default,backward,12,1,1,2,4758,1399,4
-5396,alias_default_1045,call_function,alias.default,backward,12,1,1,3,1405,1405,4
-5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8
-5398,sum_63,call_function,sum.dim_IntList,backward,12,1,1,1,4763,1397,5
-5399,div_59,call_function,div.Tensor,backward,12,1,1,1,1406,1397,6
-5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8
-5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10
-5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8
-5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8
-5404,sum_64,call_function,sum.dim_IntList,backward,12,1,1,1,4760,3,5
-5405,convert_element_type_1378,call_function,convert_element_type.default,backward,12,1,1,1,4768,1393,6
-5406,convert_element_type_1379,call_function,convert_element_type.default,backward,12,1,1,1,4761,2,3
-5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10
-5408,dtype_cast_395,call_function,dtype_cast.default,backward,12,1,1,1,4762,1,3
-5409,alias_default_1363,call_function,alias.default,backward,12,1,1,0,4763,0,2
-5410,alias_default_1046,call_function,alias.default,unknown,,1,1,3,4770,1391,4
-5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5
-5412,permute_807,call_function,permute.default,backward,12,1,1,1,4,1387,3
-5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5
-5414,permute_808,call_function,permute.default,backward,12,1,1,1,4772,2,4
-5415,dtype_cast_396,call_function,dtype_cast.default,backward,12,1,1,1,4773,1,4
-5416,alias_default_1358,call_function,alias.default,backward,12,1,1,0,4774,0,3
-5417,view_986,call_function,view.default,backward,12,1,1,1,4773,1385,4
-5418,permute_809,call_function,permute.default,backward,12,1,1,1,4774,1384,4
-5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2
-5420,getitem_297,call_function,getitem,backward,12,1,1,1,4779,1356,2
-5421,getitem_298,call_function,getitem,backward,12,1,1,1,4779,1357,2
-5422,getitem_299,call_function,getitem,backward,12,1,1,1,4779,1350,2
-5423,permute_810,call_function,permute.default,backward,12,1,1,1,4780,1349,2
-5424,permute_811,call_function,permute.default,backward,12,1,1,1,4780,1356,2
-5425,permute_812,call_function,permute.default,backward,12,1,1,1,4780,1355,2
-5426,convert_element_type_1384,call_function,convert_element_type.default,backward,12,1,1,1,4781,1355,2
-5427,convert_element_type_1385,call_function,convert_element_type.default,backward,12,1,1,1,4781,1354,2
-5428,view_987,call_function,view.default,backward,12,1,1,1,4782,1354,2
-5429,view_as_complex_86,call_function,view_as_complex.default,backward,12,1,1,1,4783,1353,6
-5430,_conj_30,call_function,_conj.default,backward,12,1,1,1,4,1354,3
-5431,clone_126,call_function,clone.default,backward,12,1,1,1,5,1353,3
-5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8
-5433,view_988,call_function,view.default,backward,12,1,1,1,4782,1353,2
-5434,view_as_complex_87,call_function,view_as_complex.default,backward,12,1,1,1,4783,1352,6
-5435,_conj_31,call_function,_conj.default,backward,12,1,1,1,4,1353,3
-5436,clone_127,call_function,clone.default,backward,12,1,1,1,5,1352,3
-5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8
-5438,view_as_real_86,call_function,view_as_real.default,backward,12,1,1,1,4787,1351,6
-5439,view_989,call_function,view.default,backward,12,1,1,1,4788,1350,6
-5440,convert_element_type_1386,call_function,convert_element_type.default,backward,12,1,1,1,4789,1349,6
-5441,view_as_real_87,call_function,view_as_real.default,backward,12,1,1,1,4787,1350,6
-5442,view_990,call_function,view.default,backward,12,1,1,1,4788,1349,6
-5443,convert_element_type_1387,call_function,convert_element_type.default,backward,12,1,1,1,4789,1348,6
-5444,view_991,call_function,view.default,backward,12,1,1,1,4781,1348,2
-5445,view_992,call_function,view.default,backward,12,1,1,1,4790,1348,5
-5446,view_993,call_function,view.default,backward,12,1,1,1,4790,1347,5
-5447,alias_default_1047,call_function,alias.default,backward,12,1,1,2,4782,1347,4
-5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5
-5449,permute_815,call_function,permute.default,backward,12,1,1,1,4,1343,3
-5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5
-5451,permute_816,call_function,permute.default,backward,12,1,1,1,4784,2,4
-5452,dtype_cast_397,call_function,dtype_cast.default,backward,12,1,1,1,4785,1,4
-5453,alias_default_1357,call_function,alias.default,backward,12,1,1,0,4786,0,3
-5454,alias_default_1048,call_function,alias.default,backward,12,1,1,2,4791,1347,4
-5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5
-5456,permute_819,call_function,permute.default,backward,12,1,1,1,4,1343,3
-5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5
-5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10
-5459,permute_820,call_function,permute.default,backward,12,1,1,1,4793,2,4
-5460,dtype_cast_398,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4
-5461,alias_default_1356,call_function,alias.default,backward,12,1,1,0,4795,0,3
-5462,alias_default_1049,call_function,alias.default,backward,12,1,1,2,4791,1346,4
-5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5
-5464,permute_823,call_function,permute.default,backward,12,1,1,1,4,1342,3
-5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5
-5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10
-5467,permute_824,call_function,permute.default,backward,12,1,1,1,4793,2,4
-5468,dtype_cast_399,call_function,dtype_cast.default,backward,12,1,1,1,4794,1,4
-5469,alias_default_1355,call_function,alias.default,backward,12,1,1,0,4795,0,3
-5470,convert_element_type_1400,call_function,convert_element_type.default,backward,12,1,1,1,4817,1339,8
-5471,convert_element_type_1401,call_function,convert_element_type.default,backward,12,1,1,1,1329,1339,4
-5472,convert_element_type_1402,call_function,convert_element_type.default,backward,12,1,1,1,3,1333,2
-5473,alias_default_1050,call_function,alias.default,backward,12,1,1,2,4818,1338,4
-5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8
-5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8
-5476,alias_default_1051,call_function,alias.default,backward,12,1,1,2,4821,1331,4
-5477,alias_default_1052,call_function,alias.default,backward,12,1,1,3,1338,1337,4
-5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8
-5479,sum_65,call_function,sum.dim_IntList,backward,12,1,1,1,4826,1329,5
-5480,div_60,call_function,div.Tensor,backward,12,1,1,1,1339,1329,6
-5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8
-5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10
-5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8
-5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8
-5485,sum_66,call_function,sum.dim_IntList,backward,12,1,1,1,4823,3,5
-5486,convert_element_type_1403,call_function,convert_element_type.default,backward,12,1,1,1,4831,1325,6
-5487,convert_element_type_1404,call_function,convert_element_type.default,backward,12,1,1,1,4824,2,3
-5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10
-5489,dtype_cast_400,call_function,dtype_cast.default,backward,12,1,1,1,4825,1,3
-5490,alias_default_1362,call_function,alias.default,backward,12,1,1,0,4826,0,2
-5491,alias_default_1053,call_function,alias.default,unknown,,1,1,3,4833,1323,4
-5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5
-5493,permute_827,call_function,permute.default,backward,11,1,1,1,4,1319,3
-5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5
-5495,permute_828,call_function,permute.default,backward,11,1,1,1,4835,2,4
-5496,dtype_cast_401,call_function,dtype_cast.default,backward,11,1,1,1,4836,1,4
-5497,alias_default_1351,call_function,alias.default,backward,11,1,1,0,4837,0,3
-5498,alias_default_1054,call_function,alias.default,backward,11,1,1,2,4836,1317,4
-5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8
-5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8
-5501,alias_default_1055,call_function,alias.default,backward,11,1,1,2,4838,1304,4
-5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5
-5503,permute_831,call_function,permute.default,backward,11,1,1,1,4,1300,3
-5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5
-5505,permute_832,call_function,permute.default,backward,11,1,1,1,4840,2,4
-5506,dtype_cast_402,call_function,dtype_cast.default,backward,11,1,1,1,4841,1,4
-5507,alias_default_1352,call_function,alias.default,backward,11,1,1,0,4842,0,3
-5508,convert_element_type_1413,call_function,convert_element_type.default,backward,11,1,1,1,4838,1308,6
-5509,convert_element_type_1414,call_function,convert_element_type.default,backward,11,1,1,1,1306,1318,4
-5510,alias_default_1056,call_function,alias.default,backward,11,1,1,2,1307,1317,4
-5511,neg_44,call_function,neg.default,backward,11,1,1,1,1308,1316,8
-5512,exp_44,call_function,exp.default,backward,11,1,1,1,1309,1315,6
-5513,add_253,call_function,add.Tensor,backward,11,1,1,1,1310,1314,4
-5514,reciprocal_16,call_function,reciprocal.default,backward,11,1,1,1,1311,1313,4
-5515,mul_526,call_function,mul.Tensor,backward,11,1,1,1,1312,1312,6
-5516,alias_default_1057,call_function,alias.default,backward,11,1,1,2,1313,1311,4
-5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8
-5518,sub_49,call_function,sub.Tensor,backward,11,1,1,1,1314,1309,4
-5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8
-5520,add_254,call_function,add.Tensor,backward,11,1,1,1,1316,1307,4
-5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8
-5522,convert_element_type_1415,call_function,convert_element_type.default,backward,11,1,1,1,4852,1305,6
-5523,alias_default_1058,call_function,alias.default,backward,11,1,1,2,4853,1304,4
-5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5
-5525,permute_835,call_function,permute.default,backward,11,1,1,1,4,1300,3
-5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5
-5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10
-5528,permute_836,call_function,permute.default,backward,11,1,1,1,4855,2,4
-5529,dtype_cast_403,call_function,dtype_cast.default,backward,11,1,1,1,4856,1,4
-5530,alias_default_1350,call_function,alias.default,backward,11,1,1,0,4857,0,3
-5531,convert_element_type_1420,call_function,convert_element_type.default,backward,11,1,1,1,4861,1297,8
-5532,convert_element_type_1421,call_function,convert_element_type.default,backward,11,1,1,1,1286,1297,4
-5533,convert_element_type_1422,call_function,convert_element_type.default,backward,11,1,1,1,3,1291,2
-5534,alias_default_1059,call_function,alias.default,backward,11,1,1,2,4862,1296,4
-5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8
-5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8
-5537,alias_default_1060,call_function,alias.default,backward,11,1,1,2,4865,1289,4
-5538,alias_default_1061,call_function,alias.default,backward,11,1,1,3,1295,1295,4
-5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8
-5540,sum_67,call_function,sum.dim_IntList,backward,11,1,1,1,4870,1287,5
-5541,div_61,call_function,div.Tensor,backward,11,1,1,1,1296,1287,6
-5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8
-5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10
-5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8
-5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8
-5546,sum_68,call_function,sum.dim_IntList,backward,11,1,1,1,4867,3,5
-5547,convert_element_type_1423,call_function,convert_element_type.default,backward,11,1,1,1,4875,1283,6
-5548,convert_element_type_1424,call_function,convert_element_type.default,backward,11,1,1,1,4868,2,3
-5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10
-5550,dtype_cast_404,call_function,dtype_cast.default,backward,11,1,1,1,4869,1,3
-5551,alias_default_1354,call_function,alias.default,backward,11,1,1,0,4870,0,2
-5552,alias_default_1062,call_function,alias.default,unknown,,1,1,3,4877,1281,4
-5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5
-5554,permute_839,call_function,permute.default,backward,11,1,1,1,4,1277,3
-5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5
-5556,permute_840,call_function,permute.default,backward,11,1,1,1,4879,2,4
-5557,dtype_cast_405,call_function,dtype_cast.default,backward,11,1,1,1,4880,1,4
-5558,alias_default_1349,call_function,alias.default,backward,11,1,1,0,4881,0,3
-5559,view_1008,call_function,view.default,backward,11,1,1,1,4880,1275,4
-5560,permute_841,call_function,permute.default,backward,11,1,1,1,4881,1274,4
-5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2
-5562,getitem_300,call_function,getitem,backward,11,1,1,1,4886,1246,2
-5563,getitem_301,call_function,getitem,backward,11,1,1,1,4886,1247,2
-5564,getitem_302,call_function,getitem,backward,11,1,1,1,4886,1240,2
-5565,permute_842,call_function,permute.default,backward,11,1,1,1,4887,1239,2
-5566,permute_843,call_function,permute.default,backward,11,1,1,1,4887,1246,2
-5567,permute_844,call_function,permute.default,backward,11,1,1,1,4887,1245,2
-5568,convert_element_type_1429,call_function,convert_element_type.default,backward,11,1,1,1,4888,1245,2
-5569,convert_element_type_1430,call_function,convert_element_type.default,backward,11,1,1,1,4888,1244,2
-5570,view_1009,call_function,view.default,backward,11,1,1,1,4889,1244,2
-5571,view_as_complex_88,call_function,view_as_complex.default,backward,11,1,1,1,4890,1243,6
-5572,_conj_32,call_function,_conj.default,backward,11,1,1,1,4,1244,3
-5573,clone_134,call_function,clone.default,backward,11,1,1,1,5,1243,3
-5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8
-5575,view_1010,call_function,view.default,backward,11,1,1,1,4889,1243,2
-5576,view_as_complex_89,call_function,view_as_complex.default,backward,11,1,1,1,4890,1242,6
-5577,_conj_33,call_function,_conj.default,backward,11,1,1,1,4,1243,3
-5578,clone_135,call_function,clone.default,backward,11,1,1,1,5,1242,3
-5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8
-5580,view_as_real_88,call_function,view_as_real.default,backward,11,1,1,1,4894,1241,6
-5581,view_1011,call_function,view.default,backward,11,1,1,1,4895,1240,6
-5582,convert_element_type_1431,call_function,convert_element_type.default,backward,11,1,1,1,4896,1239,6
-5583,view_as_real_89,call_function,view_as_real.default,backward,11,1,1,1,4894,1240,6
-5584,view_1012,call_function,view.default,backward,11,1,1,1,4895,1239,6
-5585,convert_element_type_1432,call_function,convert_element_type.default,backward,11,1,1,1,4896,1238,6
-5586,view_1013,call_function,view.default,backward,11,1,1,1,4888,1238,2
-5587,view_1014,call_function,view.default,backward,11,1,1,1,4897,1238,5
-5588,view_1015,call_function,view.default,backward,11,1,1,1,4897,1237,5
-5589,alias_default_1063,call_function,alias.default,backward,11,1,1,2,4889,1237,4
-5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5
-5591,permute_847,call_function,permute.default,backward,11,1,1,1,4,1233,3
-5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5
-5593,permute_848,call_function,permute.default,backward,11,1,1,1,4891,2,4
-5594,dtype_cast_406,call_function,dtype_cast.default,backward,11,1,1,1,4892,1,4
-5595,alias_default_1348,call_function,alias.default,backward,11,1,1,0,4893,0,3
-5596,alias_default_1064,call_function,alias.default,backward,11,1,1,2,4898,1237,4
-5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5
-5598,permute_851,call_function,permute.default,backward,11,1,1,1,4,1233,3
-5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5
-5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10
-5601,permute_852,call_function,permute.default,backward,11,1,1,1,4900,2,4
-5602,dtype_cast_407,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4
-5603,alias_default_1347,call_function,alias.default,backward,11,1,1,0,4902,0,3
-5604,alias_default_1065,call_function,alias.default,backward,11,1,1,2,4898,1236,4
-5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5
-5606,permute_855,call_function,permute.default,backward,11,1,1,1,4,1232,3
-5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5
-5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10
-5609,permute_856,call_function,permute.default,backward,11,1,1,1,4900,2,4
-5610,dtype_cast_408,call_function,dtype_cast.default,backward,11,1,1,1,4901,1,4
-5611,alias_default_1346,call_function,alias.default,backward,11,1,1,0,4902,0,3
-5612,convert_element_type_1445,call_function,convert_element_type.default,backward,11,1,1,1,4924,1229,8
-5613,convert_element_type_1446,call_function,convert_element_type.default,backward,11,1,1,1,1219,1229,4
-5614,convert_element_type_1447,call_function,convert_element_type.default,backward,11,1,1,1,3,1223,2
-5615,alias_default_1066,call_function,alias.default,backward,11,1,1,2,4925,1228,4
-5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8
-5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8
-5618,alias_default_1067,call_function,alias.default,backward,11,1,1,2,4928,1221,4
-5619,alias_default_1068,call_function,alias.default,backward,11,1,1,3,1228,1227,4
-5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8
-5621,sum_69,call_function,sum.dim_IntList,backward,11,1,1,1,4933,1219,5
-5622,div_62,call_function,div.Tensor,backward,11,1,1,1,1229,1219,6
-5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8
-5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10
-5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8
-5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8
-5627,sum_70,call_function,sum.dim_IntList,backward,11,1,1,1,4930,3,5
-5628,convert_element_type_1448,call_function,convert_element_type.default,backward,11,1,1,1,4938,1215,6
-5629,convert_element_type_1449,call_function,convert_element_type.default,backward,11,1,1,1,4931,2,3
-5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10
-5631,dtype_cast_409,call_function,dtype_cast.default,backward,11,1,1,1,4932,1,3
-5632,alias_default_1353,call_function,alias.default,backward,11,1,1,0,4933,0,2
-5633,alias_default_1069,call_function,alias.default,unknown,,1,1,3,4940,1213,4
-5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5
-5635,permute_859,call_function,permute.default,backward,10,1,1,1,4,1209,3
-5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5
-5637,permute_860,call_function,permute.default,backward,10,1,1,1,4942,2,4
-5638,dtype_cast_410,call_function,dtype_cast.default,backward,10,1,1,1,4943,1,4
-5639,alias_default_1342,call_function,alias.default,backward,10,1,1,0,4944,0,3
-5640,alias_default_1070,call_function,alias.default,backward,10,1,1,2,4943,1207,4
-5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8
-5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8
-5643,alias_default_1071,call_function,alias.default,backward,10,1,1,2,4945,1194,4
-5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5
-5645,permute_863,call_function,permute.default,backward,10,1,1,1,4,1190,3
-5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5
-5647,permute_864,call_function,permute.default,backward,10,1,1,1,4947,2,4
-5648,dtype_cast_411,call_function,dtype_cast.default,backward,10,1,1,1,4948,1,4
-5649,alias_default_1343,call_function,alias.default,backward,10,1,1,0,4949,0,3
-5650,convert_element_type_1458,call_function,convert_element_type.default,backward,10,1,1,1,4945,1198,6
-5651,convert_element_type_1459,call_function,convert_element_type.default,backward,10,1,1,1,1196,1208,4
-5652,alias_default_1072,call_function,alias.default,backward,10,1,1,2,1197,1207,4
-5653,neg_45,call_function,neg.default,backward,10,1,1,1,1198,1206,8
-5654,exp_45,call_function,exp.default,backward,10,1,1,1,1199,1205,6
-5655,add_260,call_function,add.Tensor,backward,10,1,1,1,1200,1204,4
-5656,reciprocal_17,call_function,reciprocal.default,backward,10,1,1,1,1201,1203,4
-5657,mul_546,call_function,mul.Tensor,backward,10,1,1,1,1202,1202,6
-5658,alias_default_1073,call_function,alias.default,backward,10,1,1,2,1203,1201,4
-5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8
-5660,sub_52,call_function,sub.Tensor,backward,10,1,1,1,1204,1199,4
-5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8
-5662,add_261,call_function,add.Tensor,backward,10,1,1,1,1206,1197,4
-5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8
-5664,convert_element_type_1460,call_function,convert_element_type.default,backward,10,1,1,1,4959,1195,6
-5665,alias_default_1074,call_function,alias.default,backward,10,1,1,2,4960,1194,4
-5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5
-5667,permute_867,call_function,permute.default,backward,10,1,1,1,4,1190,3
-5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5
-5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10
-5670,permute_868,call_function,permute.default,backward,10,1,1,1,4962,2,4
-5671,dtype_cast_412,call_function,dtype_cast.default,backward,10,1,1,1,4963,1,4
-5672,alias_default_1341,call_function,alias.default,backward,10,1,1,0,4964,0,3
-5673,convert_element_type_1465,call_function,convert_element_type.default,backward,10,1,1,1,4968,1187,8
-5674,convert_element_type_1466,call_function,convert_element_type.default,backward,10,1,1,1,1176,1187,4
-5675,convert_element_type_1467,call_function,convert_element_type.default,backward,10,1,1,1,3,1181,2
-5676,alias_default_1075,call_function,alias.default,backward,10,1,1,2,4969,1186,4
-5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8
-5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8
-5679,alias_default_1076,call_function,alias.default,backward,10,1,1,2,4972,1179,4
-5680,alias_default_1077,call_function,alias.default,backward,10,1,1,3,1185,1185,4
-5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8
-5682,sum_71,call_function,sum.dim_IntList,backward,10,1,1,1,4977,1177,5
-5683,div_63,call_function,div.Tensor,backward,10,1,1,1,1186,1177,6
-5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8
-5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10
-5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8
-5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8
-5688,sum_72,call_function,sum.dim_IntList,backward,10,1,1,1,4974,3,5
-5689,convert_element_type_1468,call_function,convert_element_type.default,backward,10,1,1,1,4982,1173,6
-5690,convert_element_type_1469,call_function,convert_element_type.default,backward,10,1,1,1,4975,2,3
-5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10
-5692,dtype_cast_413,call_function,dtype_cast.default,backward,10,1,1,1,4976,1,3
-5693,alias_default_1345,call_function,alias.default,backward,10,1,1,0,4977,0,2
-5694,alias_default_1078,call_function,alias.default,unknown,,1,1,3,4984,1171,4
-5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5
-5696,permute_871,call_function,permute.default,backward,10,1,1,1,4,1167,3
-5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5
-5698,permute_872,call_function,permute.default,backward,10,1,1,1,4986,2,4
-5699,dtype_cast_414,call_function,dtype_cast.default,backward,10,1,1,1,4987,1,4
-5700,alias_default_1340,call_function,alias.default,backward,10,1,1,0,4988,0,3
-5701,view_1030,call_function,view.default,backward,10,1,1,1,4987,1165,4
-5702,permute_873,call_function,permute.default,backward,10,1,1,1,4988,1164,4
-5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2
-5704,getitem_303,call_function,getitem,backward,10,1,1,1,4993,1136,2
-5705,getitem_304,call_function,getitem,backward,10,1,1,1,4993,1137,2
-5706,getitem_305,call_function,getitem,backward,10,1,1,1,4993,1130,2
-5707,permute_874,call_function,permute.default,backward,10,1,1,1,4994,1129,2
-5708,permute_875,call_function,permute.default,backward,10,1,1,1,4994,1136,2
-5709,permute_876,call_function,permute.default,backward,10,1,1,1,4994,1135,2
-5710,convert_element_type_1474,call_function,convert_element_type.default,backward,10,1,1,1,4995,1135,2
-5711,convert_element_type_1475,call_function,convert_element_type.default,backward,10,1,1,1,4995,1134,2
-5712,view_1031,call_function,view.default,backward,10,1,1,1,4996,1134,2
-5713,view_as_complex_90,call_function,view_as_complex.default,backward,10,1,1,1,4997,1133,6
-5714,_conj_34,call_function,_conj.default,backward,10,1,1,1,4,1134,3
-5715,clone_142,call_function,clone.default,backward,10,1,1,1,5,1133,3
-5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8
-5717,view_1032,call_function,view.default,backward,10,1,1,1,4996,1133,2
-5718,view_as_complex_91,call_function,view_as_complex.default,backward,10,1,1,1,4997,1132,6
-5719,_conj_35,call_function,_conj.default,backward,10,1,1,1,4,1133,3
-5720,clone_143,call_function,clone.default,backward,10,1,1,1,5,1132,3
-5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8
-5722,view_as_real_90,call_function,view_as_real.default,backward,10,1,1,1,5001,1131,6
-5723,view_1033,call_function,view.default,backward,10,1,1,1,5002,1130,6
-5724,convert_element_type_1476,call_function,convert_element_type.default,backward,10,1,1,1,5003,1129,6
-5725,view_as_real_91,call_function,view_as_real.default,backward,10,1,1,1,5001,1130,6
-5726,view_1034,call_function,view.default,backward,10,1,1,1,5002,1129,6
-5727,convert_element_type_1477,call_function,convert_element_type.default,backward,10,1,1,1,5003,1128,6
-5728,view_1035,call_function,view.default,backward,10,1,1,1,4995,1128,2
-5729,view_1036,call_function,view.default,backward,10,1,1,1,5004,1128,5
-5730,view_1037,call_function,view.default,backward,10,1,1,1,5004,1127,5
-5731,alias_default_1079,call_function,alias.default,backward,10,1,1,2,4996,1127,4
-5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5
-5733,permute_879,call_function,permute.default,backward,10,1,1,1,4,1123,3
-5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5
-5735,permute_880,call_function,permute.default,backward,10,1,1,1,4998,2,4
-5736,dtype_cast_415,call_function,dtype_cast.default,backward,10,1,1,1,4999,1,4
-5737,alias_default_1339,call_function,alias.default,backward,10,1,1,0,5000,0,3
-5738,alias_default_1080,call_function,alias.default,backward,10,1,1,2,5005,1127,4
-5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5
-5740,permute_883,call_function,permute.default,backward,10,1,1,1,4,1123,3
-5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5
-5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10
-5743,permute_884,call_function,permute.default,backward,10,1,1,1,5007,2,4
-5744,dtype_cast_416,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4
-5745,alias_default_1338,call_function,alias.default,backward,10,1,1,0,5009,0,3
-5746,alias_default_1081,call_function,alias.default,backward,10,1,1,2,5005,1126,4
-5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5
-5748,permute_887,call_function,permute.default,backward,10,1,1,1,4,1122,3
-5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5
-5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10
-5751,permute_888,call_function,permute.default,backward,10,1,1,1,5007,2,4
-5752,dtype_cast_417,call_function,dtype_cast.default,backward,10,1,1,1,5008,1,4
-5753,alias_default_1337,call_function,alias.default,backward,10,1,1,0,5009,0,3
-5754,convert_element_type_1490,call_function,convert_element_type.default,backward,10,1,1,1,5031,1119,8
-5755,convert_element_type_1491,call_function,convert_element_type.default,backward,10,1,1,1,1109,1119,4
-5756,convert_element_type_1492,call_function,convert_element_type.default,backward,10,1,1,1,3,1113,2
-5757,alias_default_1082,call_function,alias.default,backward,10,1,1,2,5032,1118,4
-5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8
-5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8
-5760,alias_default_1083,call_function,alias.default,backward,10,1,1,2,5035,1111,4
-5761,alias_default_1084,call_function,alias.default,backward,10,1,1,3,1118,1117,4
-5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8
-5763,sum_73,call_function,sum.dim_IntList,backward,10,1,1,1,5040,1109,5
-5764,div_64,call_function,div.Tensor,backward,10,1,1,1,1119,1109,6
-5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8
-5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10
-5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8
-5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8
-5769,sum_74,call_function,sum.dim_IntList,backward,10,1,1,1,5037,3,5
-5770,convert_element_type_1493,call_function,convert_element_type.default,backward,10,1,1,1,5045,1105,6
-5771,convert_element_type_1494,call_function,convert_element_type.default,backward,10,1,1,1,5038,2,3
-5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10
-5773,dtype_cast_418,call_function,dtype_cast.default,backward,10,1,1,1,5039,1,3
-5774,alias_default_1344,call_function,alias.default,backward,10,1,1,0,5040,0,2
-5775,alias_default_1085,call_function,alias.default,unknown,,1,1,3,5047,1103,4
-5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5
-5777,permute_891,call_function,permute.default,backward,9,1,1,1,4,1099,3
-5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5
-5779,permute_892,call_function,permute.default,backward,9,1,1,1,5049,2,4
-5780,dtype_cast_419,call_function,dtype_cast.default,backward,9,1,1,1,5050,1,4
-5781,alias_default_1333,call_function,alias.default,backward,9,1,1,0,5051,0,3
-5782,alias_default_1086,call_function,alias.default,backward,9,1,1,2,5050,1097,4
-5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8
-5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8
-5785,alias_default_1087,call_function,alias.default,backward,9,1,1,2,5052,1084,4
-5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5
-5787,permute_895,call_function,permute.default,backward,9,1,1,1,4,1080,3
-5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5
-5789,permute_896,call_function,permute.default,backward,9,1,1,1,5054,2,4
-5790,dtype_cast_420,call_function,dtype_cast.default,backward,9,1,1,1,5055,1,4
-5791,alias_default_1334,call_function,alias.default,backward,9,1,1,0,5056,0,3
-5792,convert_element_type_1503,call_function,convert_element_type.default,backward,9,1,1,1,5052,1088,6
-5793,convert_element_type_1504,call_function,convert_element_type.default,backward,9,1,1,1,1086,1098,4
-5794,alias_default_1088,call_function,alias.default,backward,9,1,1,2,1087,1097,4
-5795,neg_46,call_function,neg.default,backward,9,1,1,1,1088,1096,8
-5796,exp_46,call_function,exp.default,backward,9,1,1,1,1089,1095,6
-5797,add_267,call_function,add.Tensor,backward,9,1,1,1,1090,1094,4
-5798,reciprocal_18,call_function,reciprocal.default,backward,9,1,1,1,1091,1093,4
-5799,mul_566,call_function,mul.Tensor,backward,9,1,1,1,1092,1092,6
-5800,alias_default_1089,call_function,alias.default,backward,9,1,1,2,1093,1091,4
-5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8
-5802,sub_55,call_function,sub.Tensor,backward,9,1,1,1,1094,1089,4
-5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8
-5804,add_268,call_function,add.Tensor,backward,9,1,1,1,1096,1087,4
-5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8
-5806,convert_element_type_1505,call_function,convert_element_type.default,backward,9,1,1,1,5066,1085,6
-5807,alias_default_1090,call_function,alias.default,backward,9,1,1,2,5067,1084,4
-5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5
-5809,permute_899,call_function,permute.default,backward,9,1,1,1,4,1080,3
-5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5
-5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10
-5812,permute_900,call_function,permute.default,backward,9,1,1,1,5069,2,4
-5813,dtype_cast_421,call_function,dtype_cast.default,backward,9,1,1,1,5070,1,4
-5814,alias_default_1332,call_function,alias.default,backward,9,1,1,0,5071,0,3
-5815,convert_element_type_1510,call_function,convert_element_type.default,backward,9,1,1,1,5075,1077,8
-5816,convert_element_type_1511,call_function,convert_element_type.default,backward,9,1,1,1,1066,1077,4
-5817,convert_element_type_1512,call_function,convert_element_type.default,backward,9,1,1,1,3,1071,2
-5818,alias_default_1091,call_function,alias.default,backward,9,1,1,2,5076,1076,4
-5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8
-5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8
-5821,alias_default_1092,call_function,alias.default,backward,9,1,1,2,5079,1069,4
-5822,alias_default_1093,call_function,alias.default,backward,9,1,1,3,1075,1075,4
-5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8
-5824,sum_75,call_function,sum.dim_IntList,backward,9,1,1,1,5084,1067,5
-5825,div_65,call_function,div.Tensor,backward,9,1,1,1,1076,1067,6
-5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8
-5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10
-5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8
-5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8
-5830,sum_76,call_function,sum.dim_IntList,backward,9,1,1,1,5081,3,5
-5831,convert_element_type_1513,call_function,convert_element_type.default,backward,9,1,1,1,5089,1063,6
-5832,convert_element_type_1514,call_function,convert_element_type.default,backward,9,1,1,1,5082,2,3
-5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10
-5834,dtype_cast_422,call_function,dtype_cast.default,backward,9,1,1,1,5083,1,3
-5835,alias_default_1336,call_function,alias.default,backward,9,1,1,0,5084,0,2
-5836,alias_default_1094,call_function,alias.default,unknown,,1,1,3,5091,1061,4
-5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5
-5838,permute_903,call_function,permute.default,backward,9,1,1,1,4,1057,3
-5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5
-5840,permute_904,call_function,permute.default,backward,9,1,1,1,5093,2,4
-5841,dtype_cast_423,call_function,dtype_cast.default,backward,9,1,1,1,5094,1,4
-5842,alias_default_1331,call_function,alias.default,backward,9,1,1,0,5095,0,3
-5843,view_1052,call_function,view.default,backward,9,1,1,1,5094,1055,4
-5844,permute_905,call_function,permute.default,backward,9,1,1,1,5095,1054,4
-5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2
-5846,getitem_306,call_function,getitem,backward,9,1,1,1,5100,1026,2
-5847,getitem_307,call_function,getitem,backward,9,1,1,1,5100,1027,2
-5848,getitem_308,call_function,getitem,backward,9,1,1,1,5100,1020,2
-5849,permute_906,call_function,permute.default,backward,9,1,1,1,5101,1019,2
-5850,permute_907,call_function,permute.default,backward,9,1,1,1,5101,1026,2
-5851,permute_908,call_function,permute.default,backward,9,1,1,1,5101,1025,2
-5852,convert_element_type_1519,call_function,convert_element_type.default,backward,9,1,1,1,5102,1025,2
-5853,convert_element_type_1520,call_function,convert_element_type.default,backward,9,1,1,1,5102,1024,2
-5854,view_1053,call_function,view.default,backward,9,1,1,1,5103,1024,2
-5855,view_as_complex_92,call_function,view_as_complex.default,backward,9,1,1,1,5104,1023,6
-5856,_conj_36,call_function,_conj.default,backward,9,1,1,1,4,1024,3
-5857,clone_150,call_function,clone.default,backward,9,1,1,1,5,1023,3
-5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8
-5859,view_1054,call_function,view.default,backward,9,1,1,1,5103,1023,2
-5860,view_as_complex_93,call_function,view_as_complex.default,backward,9,1,1,1,5104,1022,6
-5861,_conj_37,call_function,_conj.default,backward,9,1,1,1,4,1023,3
-5862,clone_151,call_function,clone.default,backward,9,1,1,1,5,1022,3
-5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8
-5864,view_as_real_92,call_function,view_as_real.default,backward,9,1,1,1,5108,1021,6
-5865,view_1055,call_function,view.default,backward,9,1,1,1,5109,1020,6
-5866,convert_element_type_1521,call_function,convert_element_type.default,backward,9,1,1,1,5110,1019,6
-5867,view_as_real_93,call_function,view_as_real.default,backward,9,1,1,1,5108,1020,6
-5868,view_1056,call_function,view.default,backward,9,1,1,1,5109,1019,6
-5869,convert_element_type_1522,call_function,convert_element_type.default,backward,9,1,1,1,5110,1018,6
-5870,view_1057,call_function,view.default,backward,9,1,1,1,5102,1018,2
-5871,view_1058,call_function,view.default,backward,9,1,1,1,5111,1018,5
-5872,view_1059,call_function,view.default,backward,9,1,1,1,5111,1017,5
-5873,alias_default_1095,call_function,alias.default,backward,9,1,1,2,5103,1017,4
-5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5
-5875,permute_911,call_function,permute.default,backward,9,1,1,1,4,1013,3
-5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5
-5877,permute_912,call_function,permute.default,backward,9,1,1,1,5105,2,4
-5878,dtype_cast_424,call_function,dtype_cast.default,backward,9,1,1,1,5106,1,4
-5879,alias_default_1330,call_function,alias.default,backward,9,1,1,0,5107,0,3
-5880,alias_default_1096,call_function,alias.default,backward,9,1,1,2,5112,1017,4
-5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5
-5882,permute_915,call_function,permute.default,backward,9,1,1,1,4,1013,3
-5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5
-5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10
-5885,permute_916,call_function,permute.default,backward,9,1,1,1,5114,2,4
-5886,dtype_cast_425,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4
-5887,alias_default_1329,call_function,alias.default,backward,9,1,1,0,5116,0,3
-5888,alias_default_1097,call_function,alias.default,backward,9,1,1,2,5112,1016,4
-5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5
-5890,permute_919,call_function,permute.default,backward,9,1,1,1,4,1012,3
-5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5
-5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10
-5893,permute_920,call_function,permute.default,backward,9,1,1,1,5114,2,4
-5894,dtype_cast_426,call_function,dtype_cast.default,backward,9,1,1,1,5115,1,4
-5895,alias_default_1328,call_function,alias.default,backward,9,1,1,0,5116,0,3
-5896,convert_element_type_1535,call_function,convert_element_type.default,backward,9,1,1,1,5138,1009,8
-5897,convert_element_type_1536,call_function,convert_element_type.default,backward,9,1,1,1,999,1009,4
-5898,convert_element_type_1537,call_function,convert_element_type.default,backward,9,1,1,1,3,1003,2
-5899,alias_default_1098,call_function,alias.default,backward,9,1,1,2,5139,1008,4
-5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8
-5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8
-5902,alias_default_1099,call_function,alias.default,backward,9,1,1,2,5142,1001,4
-5903,alias_default_1100,call_function,alias.default,backward,9,1,1,3,1008,1007,4
-5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8
-5905,sum_77,call_function,sum.dim_IntList,backward,9,1,1,1,5147,999,5
-5906,div_66,call_function,div.Tensor,backward,9,1,1,1,1009,999,6
-5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8
-5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10
-5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8
-5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8
-5911,sum_78,call_function,sum.dim_IntList,backward,9,1,1,1,5144,3,5
-5912,convert_element_type_1538,call_function,convert_element_type.default,backward,9,1,1,1,5152,995,6
-5913,convert_element_type_1539,call_function,convert_element_type.default,backward,9,1,1,1,5145,2,3
-5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10
-5915,dtype_cast_427,call_function,dtype_cast.default,backward,9,1,1,1,5146,1,3
-5916,alias_default_1335,call_function,alias.default,backward,9,1,1,0,5147,0,2
-5917,alias_default_1101,call_function,alias.default,unknown,,1,1,3,5154,993,4
-5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5
-5919,permute_923,call_function,permute.default,backward,8,1,1,1,4,989,3
-5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5
-5921,permute_924,call_function,permute.default,backward,8,1,1,1,5156,2,4
-5922,dtype_cast_428,call_function,dtype_cast.default,backward,8,1,1,1,5157,1,4
-5923,alias_default_1324,call_function,alias.default,backward,8,1,1,0,5158,0,3
-5924,alias_default_1102,call_function,alias.default,backward,8,1,1,2,5157,987,4
-5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8
-5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8
-5927,alias_default_1103,call_function,alias.default,backward,8,1,1,2,5159,974,4
-5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5
-5929,permute_927,call_function,permute.default,backward,8,1,1,1,4,970,3
-5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5
-5931,permute_928,call_function,permute.default,backward,8,1,1,1,5161,2,4
-5932,dtype_cast_429,call_function,dtype_cast.default,backward,8,1,1,1,5162,1,4
-5933,alias_default_1325,call_function,alias.default,backward,8,1,1,0,5163,0,3
-5934,convert_element_type_1548,call_function,convert_element_type.default,backward,8,1,1,1,5159,978,6
-5935,convert_element_type_1549,call_function,convert_element_type.default,backward,8,1,1,1,976,988,4
-5936,alias_default_1104,call_function,alias.default,backward,8,1,1,2,977,987,4
-5937,neg_47,call_function,neg.default,backward,8,1,1,1,978,986,8
-5938,exp_47,call_function,exp.default,backward,8,1,1,1,979,985,6
-5939,add_274,call_function,add.Tensor,backward,8,1,1,1,980,984,4
-5940,reciprocal_19,call_function,reciprocal.default,backward,8,1,1,1,981,983,4
-5941,mul_586,call_function,mul.Tensor,backward,8,1,1,1,982,982,6
-5942,alias_default_1105,call_function,alias.default,backward,8,1,1,2,983,981,4
-5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8
-5944,sub_58,call_function,sub.Tensor,backward,8,1,1,1,984,979,4
-5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8
-5946,add_275,call_function,add.Tensor,backward,8,1,1,1,986,977,4
-5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8
-5948,convert_element_type_1550,call_function,convert_element_type.default,backward,8,1,1,1,5173,975,6
-5949,alias_default_1106,call_function,alias.default,backward,8,1,1,2,5174,974,4
-5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5
-5951,permute_931,call_function,permute.default,backward,8,1,1,1,4,970,3
-5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5
-5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10
-5954,permute_932,call_function,permute.default,backward,8,1,1,1,5176,2,4
-5955,dtype_cast_430,call_function,dtype_cast.default,backward,8,1,1,1,5177,1,4
-5956,alias_default_1323,call_function,alias.default,backward,8,1,1,0,5178,0,3
-5957,convert_element_type_1555,call_function,convert_element_type.default,backward,8,1,1,1,5182,967,8
-5958,convert_element_type_1556,call_function,convert_element_type.default,backward,8,1,1,1,956,967,4
-5959,convert_element_type_1557,call_function,convert_element_type.default,backward,8,1,1,1,3,961,2
-5960,alias_default_1107,call_function,alias.default,backward,8,1,1,2,5183,966,4
-5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8
-5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8
-5963,alias_default_1108,call_function,alias.default,backward,8,1,1,2,5186,959,4
-5964,alias_default_1109,call_function,alias.default,backward,8,1,1,3,965,965,4
-5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8
-5966,sum_79,call_function,sum.dim_IntList,backward,8,1,1,1,5191,957,5
-5967,div_67,call_function,div.Tensor,backward,8,1,1,1,966,957,6
-5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8
-5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10
-5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8
-5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8
-5972,sum_80,call_function,sum.dim_IntList,backward,8,1,1,1,5188,3,5
-5973,convert_element_type_1558,call_function,convert_element_type.default,backward,8,1,1,1,5196,953,6
-5974,convert_element_type_1559,call_function,convert_element_type.default,backward,8,1,1,1,5189,2,3
-5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10
-5976,dtype_cast_431,call_function,dtype_cast.default,backward,8,1,1,1,5190,1,3
-5977,alias_default_1327,call_function,alias.default,backward,8,1,1,0,5191,0,2
-5978,alias_default_1110,call_function,alias.default,unknown,,1,1,3,5198,951,4
-5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5
-5980,permute_935,call_function,permute.default,backward,8,1,1,1,4,947,3
-5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5
-5982,permute_936,call_function,permute.default,backward,8,1,1,1,5200,2,4
-5983,dtype_cast_432,call_function,dtype_cast.default,backward,8,1,1,1,5201,1,4
-5984,alias_default_1322,call_function,alias.default,backward,8,1,1,0,5202,0,3
-5985,view_1074,call_function,view.default,backward,8,1,1,1,5201,945,4
-5986,permute_937,call_function,permute.default,backward,8,1,1,1,5202,944,4
-5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2
-5988,getitem_309,call_function,getitem,backward,8,1,1,1,5207,916,2
-5989,getitem_310,call_function,getitem,backward,8,1,1,1,5207,917,2
-5990,getitem_311,call_function,getitem,backward,8,1,1,1,5207,910,2
-5991,permute_938,call_function,permute.default,backward,8,1,1,1,5208,909,2
-5992,permute_939,call_function,permute.default,backward,8,1,1,1,5208,916,2
-5993,permute_940,call_function,permute.default,backward,8,1,1,1,5208,915,2
-5994,convert_element_type_1564,call_function,convert_element_type.default,backward,8,1,1,1,5209,915,2
-5995,convert_element_type_1565,call_function,convert_element_type.default,backward,8,1,1,1,5209,914,2
-5996,view_1075,call_function,view.default,backward,8,1,1,1,5210,914,2
-5997,view_as_complex_94,call_function,view_as_complex.default,backward,8,1,1,1,5211,913,6
-5998,_conj_38,call_function,_conj.default,backward,8,1,1,1,4,914,3
-5999,clone_158,call_function,clone.default,backward,8,1,1,1,5,913,3
-6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8
-6001,view_1076,call_function,view.default,backward,8,1,1,1,5210,913,2
-6002,view_as_complex_95,call_function,view_as_complex.default,backward,8,1,1,1,5211,912,6
-6003,_conj_39,call_function,_conj.default,backward,8,1,1,1,4,913,3
-6004,clone_159,call_function,clone.default,backward,8,1,1,1,5,912,3
-6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8
-6006,view_as_real_94,call_function,view_as_real.default,backward,8,1,1,1,5215,911,6
-6007,view_1077,call_function,view.default,backward,8,1,1,1,5216,910,6
-6008,convert_element_type_1566,call_function,convert_element_type.default,backward,8,1,1,1,5217,909,6
-6009,view_as_real_95,call_function,view_as_real.default,backward,8,1,1,1,5215,910,6
-6010,view_1078,call_function,view.default,backward,8,1,1,1,5216,909,6
-6011,convert_element_type_1567,call_function,convert_element_type.default,backward,8,1,1,1,5217,908,6
-6012,view_1079,call_function,view.default,backward,8,1,1,1,5209,908,2
-6013,view_1080,call_function,view.default,backward,8,1,1,1,5218,908,5
-6014,view_1081,call_function,view.default,backward,8,1,1,1,5218,907,5
-6015,alias_default_1111,call_function,alias.default,backward,8,1,1,2,5210,907,4
-6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5
-6017,permute_943,call_function,permute.default,backward,8,1,1,1,4,903,3
-6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5
-6019,permute_944,call_function,permute.default,backward,8,1,1,1,5212,2,4
-6020,dtype_cast_433,call_function,dtype_cast.default,backward,8,1,1,1,5213,1,4
-6021,alias_default_1321,call_function,alias.default,backward,8,1,1,0,5214,0,3
-6022,alias_default_1112,call_function,alias.default,backward,8,1,1,2,5219,907,4
-6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5
-6024,permute_947,call_function,permute.default,backward,8,1,1,1,4,903,3
-6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5
-6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10
-6027,permute_948,call_function,permute.default,backward,8,1,1,1,5221,2,4
-6028,dtype_cast_434,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4
-6029,alias_default_1320,call_function,alias.default,backward,8,1,1,0,5223,0,3
-6030,alias_default_1113,call_function,alias.default,backward,8,1,1,2,5219,906,4
-6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5
-6032,permute_951,call_function,permute.default,backward,8,1,1,1,4,902,3
-6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5
-6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10
-6035,permute_952,call_function,permute.default,backward,8,1,1,1,5221,2,4
-6036,dtype_cast_435,call_function,dtype_cast.default,backward,8,1,1,1,5222,1,4
-6037,alias_default_1319,call_function,alias.default,backward,8,1,1,0,5223,0,3
-6038,convert_element_type_1580,call_function,convert_element_type.default,backward,8,1,1,1,5245,899,8
-6039,convert_element_type_1581,call_function,convert_element_type.default,backward,8,1,1,1,889,899,4
-6040,convert_element_type_1582,call_function,convert_element_type.default,backward,8,1,1,1,3,893,2
-6041,alias_default_1114,call_function,alias.default,backward,8,1,1,2,5246,898,4
-6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8
-6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8
-6044,alias_default_1115,call_function,alias.default,backward,8,1,1,2,5249,891,4
-6045,alias_default_1116,call_function,alias.default,backward,8,1,1,3,898,897,4
-6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8
-6047,sum_81,call_function,sum.dim_IntList,backward,8,1,1,1,5254,889,5
-6048,div_68,call_function,div.Tensor,backward,8,1,1,1,899,889,6
-6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8
-6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10
-6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8
-6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8
-6053,sum_82,call_function,sum.dim_IntList,backward,8,1,1,1,5251,3,5
-6054,convert_element_type_1583,call_function,convert_element_type.default,backward,8,1,1,1,5259,885,6
-6055,convert_element_type_1584,call_function,convert_element_type.default,backward,8,1,1,1,5252,2,3
-6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10
-6057,dtype_cast_436,call_function,dtype_cast.default,backward,8,1,1,1,5253,1,3
-6058,alias_default_1326,call_function,alias.default,backward,8,1,1,0,5254,0,2
-6059,alias_default_1117,call_function,alias.default,unknown,,1,1,3,5261,883,4
-6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5
-6061,permute_955,call_function,permute.default,backward,7,1,1,1,4,879,3
-6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5
-6063,permute_956,call_function,permute.default,backward,7,1,1,1,5263,2,4
-6064,dtype_cast_437,call_function,dtype_cast.default,backward,7,1,1,1,5264,1,4
-6065,alias_default_1315,call_function,alias.default,backward,7,1,1,0,5265,0,3
-6066,alias_default_1118,call_function,alias.default,backward,7,1,1,2,5264,877,4
-6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8
-6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8
-6069,alias_default_1119,call_function,alias.default,backward,7,1,1,2,5266,864,4
-6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5
-6071,permute_959,call_function,permute.default,backward,7,1,1,1,4,860,3
-6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5
-6073,permute_960,call_function,permute.default,backward,7,1,1,1,5268,2,4
-6074,dtype_cast_438,call_function,dtype_cast.default,backward,7,1,1,1,5269,1,4
-6075,alias_default_1316,call_function,alias.default,backward,7,1,1,0,5270,0,3
-6076,convert_element_type_1593,call_function,convert_element_type.default,backward,7,1,1,1,5266,868,6
-6077,convert_element_type_1594,call_function,convert_element_type.default,backward,7,1,1,1,866,878,4
-6078,alias_default_1120,call_function,alias.default,backward,7,1,1,2,867,877,4
-6079,neg_48,call_function,neg.default,backward,7,1,1,1,868,876,8
-6080,exp_48,call_function,exp.default,backward,7,1,1,1,869,875,6
-6081,add_281,call_function,add.Tensor,backward,7,1,1,1,870,874,4
-6082,reciprocal_20,call_function,reciprocal.default,backward,7,1,1,1,871,873,4
-6083,mul_606,call_function,mul.Tensor,backward,7,1,1,1,872,872,6
-6084,alias_default_1121,call_function,alias.default,backward,7,1,1,2,873,871,4
-6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8
-6086,sub_61,call_function,sub.Tensor,backward,7,1,1,1,874,869,4
-6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8
-6088,add_282,call_function,add.Tensor,backward,7,1,1,1,876,867,4
-6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8
-6090,convert_element_type_1595,call_function,convert_element_type.default,backward,7,1,1,1,5280,865,6
-6091,alias_default_1122,call_function,alias.default,backward,7,1,1,2,5281,864,4
-6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5
-6093,permute_963,call_function,permute.default,backward,7,1,1,1,4,860,3
-6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5
-6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10
-6096,permute_964,call_function,permute.default,backward,7,1,1,1,5283,2,4
-6097,dtype_cast_439,call_function,dtype_cast.default,backward,7,1,1,1,5284,1,4
-6098,alias_default_1314,call_function,alias.default,backward,7,1,1,0,5285,0,3
-6099,convert_element_type_1600,call_function,convert_element_type.default,backward,7,1,1,1,5289,857,8
-6100,convert_element_type_1601,call_function,convert_element_type.default,backward,7,1,1,1,846,857,4
-6101,convert_element_type_1602,call_function,convert_element_type.default,backward,7,1,1,1,3,851,2
-6102,alias_default_1123,call_function,alias.default,backward,7,1,1,2,5290,856,4
-6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8
-6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8
-6105,alias_default_1124,call_function,alias.default,backward,7,1,1,2,5293,849,4
-6106,alias_default_1125,call_function,alias.default,backward,7,1,1,3,855,855,4
-6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8
-6108,sum_83,call_function,sum.dim_IntList,backward,7,1,1,1,5298,847,5
-6109,div_69,call_function,div.Tensor,backward,7,1,1,1,856,847,6
-6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8
-6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10
-6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8
-6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8
-6114,sum_84,call_function,sum.dim_IntList,backward,7,1,1,1,5295,3,5
-6115,convert_element_type_1603,call_function,convert_element_type.default,backward,7,1,1,1,5303,843,6
-6116,convert_element_type_1604,call_function,convert_element_type.default,backward,7,1,1,1,5296,2,3
-6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10
-6118,dtype_cast_440,call_function,dtype_cast.default,backward,7,1,1,1,5297,1,3
-6119,alias_default_1318,call_function,alias.default,backward,7,1,1,0,5298,0,2
-6120,alias_default_1126,call_function,alias.default,unknown,,1,1,3,5305,841,4
-6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5
-6122,permute_967,call_function,permute.default,backward,7,1,1,1,4,837,3
-6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5
-6124,permute_968,call_function,permute.default,backward,7,1,1,1,5307,2,4
-6125,dtype_cast_441,call_function,dtype_cast.default,backward,7,1,1,1,5308,1,4
-6126,alias_default_1313,call_function,alias.default,backward,7,1,1,0,5309,0,3
-6127,view_1096,call_function,view.default,backward,7,1,1,1,5308,835,4
-6128,permute_969,call_function,permute.default,backward,7,1,1,1,5309,834,4
-6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2
-6130,getitem_312,call_function,getitem,backward,7,1,1,1,5314,806,2
-6131,getitem_313,call_function,getitem,backward,7,1,1,1,5314,807,2
-6132,getitem_314,call_function,getitem,backward,7,1,1,1,5314,800,2
-6133,permute_970,call_function,permute.default,backward,7,1,1,1,5315,799,2
-6134,permute_971,call_function,permute.default,backward,7,1,1,1,5315,806,2
-6135,permute_972,call_function,permute.default,backward,7,1,1,1,5315,805,2
-6136,convert_element_type_1609,call_function,convert_element_type.default,backward,7,1,1,1,5316,805,2
-6137,convert_element_type_1610,call_function,convert_element_type.default,backward,7,1,1,1,5316,804,2
-6138,view_1097,call_function,view.default,backward,7,1,1,1,5317,804,2
-6139,view_as_complex_96,call_function,view_as_complex.default,backward,7,1,1,1,5318,803,6
-6140,_conj_40,call_function,_conj.default,backward,7,1,1,1,4,804,3
-6141,clone_166,call_function,clone.default,backward,7,1,1,1,5,803,3
-6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8
-6143,view_1098,call_function,view.default,backward,7,1,1,1,5317,803,2
-6144,view_as_complex_97,call_function,view_as_complex.default,backward,7,1,1,1,5318,802,6
-6145,_conj_41,call_function,_conj.default,backward,7,1,1,1,4,803,3
-6146,clone_167,call_function,clone.default,backward,7,1,1,1,5,802,3
-6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8
-6148,view_as_real_96,call_function,view_as_real.default,backward,7,1,1,1,5322,801,6
-6149,view_1099,call_function,view.default,backward,7,1,1,1,5323,800,6
-6150,convert_element_type_1611,call_function,convert_element_type.default,backward,7,1,1,1,5324,799,6
-6151,view_as_real_97,call_function,view_as_real.default,backward,7,1,1,1,5322,800,6
-6152,view_1100,call_function,view.default,backward,7,1,1,1,5323,799,6
-6153,convert_element_type_1612,call_function,convert_element_type.default,backward,7,1,1,1,5324,798,6
-6154,view_1101,call_function,view.default,backward,7,1,1,1,5316,798,2
-6155,view_1102,call_function,view.default,backward,7,1,1,1,5325,798,5
-6156,view_1103,call_function,view.default,backward,7,1,1,1,5325,797,5
-6157,alias_default_1127,call_function,alias.default,backward,7,1,1,2,5317,797,4
-6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5
-6159,permute_975,call_function,permute.default,backward,7,1,1,1,4,793,3
-6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5
-6161,permute_976,call_function,permute.default,backward,7,1,1,1,5319,2,4
-6162,dtype_cast_442,call_function,dtype_cast.default,backward,7,1,1,1,5320,1,4
-6163,alias_default_1312,call_function,alias.default,backward,7,1,1,0,5321,0,3
-6164,alias_default_1128,call_function,alias.default,backward,7,1,1,2,5326,797,4
-6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5
-6166,permute_979,call_function,permute.default,backward,7,1,1,1,4,793,3
-6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5
-6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10
-6169,permute_980,call_function,permute.default,backward,7,1,1,1,5328,2,4
-6170,dtype_cast_443,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4
-6171,alias_default_1311,call_function,alias.default,backward,7,1,1,0,5330,0,3
-6172,alias_default_1129,call_function,alias.default,backward,7,1,1,2,5326,796,4
-6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5
-6174,permute_983,call_function,permute.default,backward,7,1,1,1,4,792,3
-6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5
-6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10
-6177,permute_984,call_function,permute.default,backward,7,1,1,1,5328,2,4
-6178,dtype_cast_444,call_function,dtype_cast.default,backward,7,1,1,1,5329,1,4
-6179,alias_default_1310,call_function,alias.default,backward,7,1,1,0,5330,0,3
-6180,convert_element_type_1625,call_function,convert_element_type.default,backward,7,1,1,1,5352,789,8
-6181,convert_element_type_1626,call_function,convert_element_type.default,backward,7,1,1,1,779,789,4
-6182,convert_element_type_1627,call_function,convert_element_type.default,backward,7,1,1,1,3,783,2
-6183,alias_default_1130,call_function,alias.default,backward,7,1,1,2,5353,788,4
-6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8
-6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8
-6186,alias_default_1131,call_function,alias.default,backward,7,1,1,2,5356,781,4
-6187,alias_default_1132,call_function,alias.default,backward,7,1,1,3,788,787,4
-6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8
-6189,sum_85,call_function,sum.dim_IntList,backward,7,1,1,1,5361,779,5
-6190,div_70,call_function,div.Tensor,backward,7,1,1,1,789,779,6
-6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8
-6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10
-6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8
-6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8
-6195,sum_86,call_function,sum.dim_IntList,backward,7,1,1,1,5358,3,5
-6196,convert_element_type_1628,call_function,convert_element_type.default,backward,7,1,1,1,5366,775,6
-6197,convert_element_type_1629,call_function,convert_element_type.default,backward,7,1,1,1,5359,2,3
-6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10
-6199,dtype_cast_445,call_function,dtype_cast.default,backward,7,1,1,1,5360,1,3
-6200,alias_default_1317,call_function,alias.default,backward,7,1,1,0,5361,0,2
-6201,alias_default_1133,call_function,alias.default,unknown,,1,1,3,5368,773,4
-6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5
-6203,permute_987,call_function,permute.default,backward,6,1,1,1,4,769,3
-6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5
-6205,permute_988,call_function,permute.default,backward,6,1,1,1,5370,2,4
-6206,dtype_cast_446,call_function,dtype_cast.default,backward,6,1,1,1,5371,1,4
-6207,alias_default_1306,call_function,alias.default,backward,6,1,1,0,5372,0,3
-6208,alias_default_1134,call_function,alias.default,backward,6,1,1,2,5371,767,4
-6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8
-6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8
-6211,alias_default_1135,call_function,alias.default,backward,6,1,1,2,5373,754,4
-6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5
-6213,permute_991,call_function,permute.default,backward,6,1,1,1,4,750,3
-6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5
-6215,permute_992,call_function,permute.default,backward,6,1,1,1,5375,2,4
-6216,dtype_cast_447,call_function,dtype_cast.default,backward,6,1,1,1,5376,1,4
-6217,alias_default_1307,call_function,alias.default,backward,6,1,1,0,5377,0,3
-6218,convert_element_type_1638,call_function,convert_element_type.default,backward,6,1,1,1,5373,758,6
-6219,convert_element_type_1639,call_function,convert_element_type.default,backward,6,1,1,1,756,768,4
-6220,alias_default_1136,call_function,alias.default,backward,6,1,1,2,757,767,4
-6221,neg_49,call_function,neg.default,backward,6,1,1,1,758,766,8
-6222,exp_49,call_function,exp.default,backward,6,1,1,1,759,765,6
-6223,add_288,call_function,add.Tensor,backward,6,1,1,1,760,764,4
-6224,reciprocal_21,call_function,reciprocal.default,backward,6,1,1,1,761,763,4
-6225,mul_626,call_function,mul.Tensor,backward,6,1,1,1,762,762,6
-6226,alias_default_1137,call_function,alias.default,backward,6,1,1,2,763,761,4
-6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8
-6228,sub_64,call_function,sub.Tensor,backward,6,1,1,1,764,759,4
-6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8
-6230,add_289,call_function,add.Tensor,backward,6,1,1,1,766,757,4
-6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8
-6232,convert_element_type_1640,call_function,convert_element_type.default,backward,6,1,1,1,5387,755,6
-6233,alias_default_1138,call_function,alias.default,backward,6,1,1,2,5388,754,4
-6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5
-6235,permute_995,call_function,permute.default,backward,6,1,1,1,4,750,3
-6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5
-6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10
-6238,permute_996,call_function,permute.default,backward,6,1,1,1,5390,2,4
-6239,dtype_cast_448,call_function,dtype_cast.default,backward,6,1,1,1,5391,1,4
-6240,alias_default_1305,call_function,alias.default,backward,6,1,1,0,5392,0,3
-6241,convert_element_type_1645,call_function,convert_element_type.default,backward,6,1,1,1,5396,747,8
-6242,convert_element_type_1646,call_function,convert_element_type.default,backward,6,1,1,1,736,747,4
-6243,convert_element_type_1647,call_function,convert_element_type.default,backward,6,1,1,1,3,741,2
-6244,alias_default_1139,call_function,alias.default,backward,6,1,1,2,5397,746,4
-6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8
-6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8
-6247,alias_default_1140,call_function,alias.default,backward,6,1,1,2,5400,739,4
-6248,alias_default_1141,call_function,alias.default,backward,6,1,1,3,745,745,4
-6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8
-6250,sum_87,call_function,sum.dim_IntList,backward,6,1,1,1,5405,737,5
-6251,div_71,call_function,div.Tensor,backward,6,1,1,1,746,737,6
-6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8
-6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10
-6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8
-6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8
-6256,sum_88,call_function,sum.dim_IntList,backward,6,1,1,1,5402,3,5
-6257,convert_element_type_1648,call_function,convert_element_type.default,backward,6,1,1,1,5410,733,6
-6258,convert_element_type_1649,call_function,convert_element_type.default,backward,6,1,1,1,5403,2,3
-6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10
-6260,dtype_cast_449,call_function,dtype_cast.default,backward,6,1,1,1,5404,1,3
-6261,alias_default_1309,call_function,alias.default,backward,6,1,1,0,5405,0,2
-6262,alias_default_1142,call_function,alias.default,unknown,,1,1,3,5412,731,4
-6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5
-6264,permute_999,call_function,permute.default,backward,6,1,1,1,4,727,3
-6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5
-6266,permute_1000,call_function,permute.default,backward,6,1,1,1,5414,2,4
-6267,dtype_cast_450,call_function,dtype_cast.default,backward,6,1,1,1,5415,1,4
-6268,alias_default_1304,call_function,alias.default,backward,6,1,1,0,5416,0,3
-6269,view_1118,call_function,view.default,backward,6,1,1,1,5415,725,4
-6270,permute_1001,call_function,permute.default,backward,6,1,1,1,5416,724,4
-6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2
-6272,getitem_315,call_function,getitem,backward,6,1,1,1,5421,696,2
-6273,getitem_316,call_function,getitem,backward,6,1,1,1,5421,697,2
-6274,getitem_317,call_function,getitem,backward,6,1,1,1,5421,690,2
-6275,permute_1002,call_function,permute.default,backward,6,1,1,1,5422,689,2
-6276,permute_1003,call_function,permute.default,backward,6,1,1,1,5422,696,2
-6277,permute_1004,call_function,permute.default,backward,6,1,1,1,5422,695,2
-6278,convert_element_type_1654,call_function,convert_element_type.default,backward,6,1,1,1,5423,695,2
-6279,convert_element_type_1655,call_function,convert_element_type.default,backward,6,1,1,1,5423,694,2
-6280,view_1119,call_function,view.default,backward,6,1,1,1,5424,694,2
-6281,view_as_complex_98,call_function,view_as_complex.default,backward,6,1,1,1,5425,693,6
-6282,_conj_42,call_function,_conj.default,backward,6,1,1,1,4,694,3
-6283,clone_174,call_function,clone.default,backward,6,1,1,1,5,693,3
-6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8
-6285,view_1120,call_function,view.default,backward,6,1,1,1,5424,693,2
-6286,view_as_complex_99,call_function,view_as_complex.default,backward,6,1,1,1,5425,692,6
-6287,_conj_43,call_function,_conj.default,backward,6,1,1,1,4,693,3
-6288,clone_175,call_function,clone.default,backward,6,1,1,1,5,692,3
-6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8
-6290,view_as_real_98,call_function,view_as_real.default,backward,6,1,1,1,5429,691,6
-6291,view_1121,call_function,view.default,backward,6,1,1,1,5430,690,6
-6292,convert_element_type_1656,call_function,convert_element_type.default,backward,6,1,1,1,5431,689,6
-6293,view_as_real_99,call_function,view_as_real.default,backward,6,1,1,1,5429,690,6
-6294,view_1122,call_function,view.default,backward,6,1,1,1,5430,689,6
-6295,convert_element_type_1657,call_function,convert_element_type.default,backward,6,1,1,1,5431,688,6
-6296,view_1123,call_function,view.default,backward,6,1,1,1,5423,688,2
-6297,view_1124,call_function,view.default,backward,6,1,1,1,5432,688,5
-6298,view_1125,call_function,view.default,backward,6,1,1,1,5432,687,5
-6299,alias_default_1143,call_function,alias.default,backward,6,1,1,2,5424,687,4
-6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5
-6301,permute_1007,call_function,permute.default,backward,6,1,1,1,4,683,3
-6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5
-6303,permute_1008,call_function,permute.default,backward,6,1,1,1,5426,2,4
-6304,dtype_cast_451,call_function,dtype_cast.default,backward,6,1,1,1,5427,1,4
-6305,alias_default_1303,call_function,alias.default,backward,6,1,1,0,5428,0,3
-6306,alias_default_1144,call_function,alias.default,backward,6,1,1,2,5433,687,4
-6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5
-6308,permute_1011,call_function,permute.default,backward,6,1,1,1,4,683,3
-6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5
-6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10
-6311,permute_1012,call_function,permute.default,backward,6,1,1,1,5435,2,4
-6312,dtype_cast_452,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4
-6313,alias_default_1302,call_function,alias.default,backward,6,1,1,0,5437,0,3
-6314,alias_default_1145,call_function,alias.default,backward,6,1,1,2,5433,686,4
-6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5
-6316,permute_1015,call_function,permute.default,backward,6,1,1,1,4,682,3
-6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5
-6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10
-6319,permute_1016,call_function,permute.default,backward,6,1,1,1,5435,2,4
-6320,dtype_cast_453,call_function,dtype_cast.default,backward,6,1,1,1,5436,1,4
-6321,alias_default_1301,call_function,alias.default,backward,6,1,1,0,5437,0,3
-6322,convert_element_type_1670,call_function,convert_element_type.default,backward,6,1,1,1,5459,679,8
-6323,convert_element_type_1671,call_function,convert_element_type.default,backward,6,1,1,1,669,679,4
-6324,convert_element_type_1672,call_function,convert_element_type.default,backward,6,1,1,1,3,673,2
-6325,alias_default_1146,call_function,alias.default,backward,6,1,1,2,5460,678,4
-6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8
-6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8
-6328,alias_default_1147,call_function,alias.default,backward,6,1,1,2,5463,671,4
-6329,alias_default_1148,call_function,alias.default,backward,6,1,1,3,678,677,4
-6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8
-6331,sum_89,call_function,sum.dim_IntList,backward,6,1,1,1,5468,669,5
-6332,div_72,call_function,div.Tensor,backward,6,1,1,1,679,669,6
-6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8
-6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10
-6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8
-6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8
-6337,sum_90,call_function,sum.dim_IntList,backward,6,1,1,1,5465,3,5
-6338,convert_element_type_1673,call_function,convert_element_type.default,backward,6,1,1,1,5473,665,6
-6339,convert_element_type_1674,call_function,convert_element_type.default,backward,6,1,1,1,5466,2,3
-6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10
-6341,dtype_cast_454,call_function,dtype_cast.default,backward,6,1,1,1,5467,1,3
-6342,alias_default_1308,call_function,alias.default,backward,6,1,1,0,5468,0,2
-6343,alias_default_1149,call_function,alias.default,unknown,,1,1,3,5475,663,4
-6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5
-6345,permute_1019,call_function,permute.default,backward,5,1,1,1,4,659,3
-6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5
-6347,permute_1020,call_function,permute.default,backward,5,1,1,1,5477,2,4
-6348,dtype_cast_455,call_function,dtype_cast.default,backward,5,1,1,1,5478,1,4
-6349,alias_default_1297,call_function,alias.default,backward,5,1,1,0,5479,0,3
-6350,alias_default_1150,call_function,alias.default,backward,5,1,1,2,5478,657,4
-6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8
-6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8
-6353,alias_default_1151,call_function,alias.default,backward,5,1,1,2,5480,644,4
-6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5
-6355,permute_1023,call_function,permute.default,backward,5,1,1,1,4,640,3
-6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5
-6357,permute_1024,call_function,permute.default,backward,5,1,1,1,5482,2,4
-6358,dtype_cast_456,call_function,dtype_cast.default,backward,5,1,1,1,5483,1,4
-6359,alias_default_1298,call_function,alias.default,backward,5,1,1,0,5484,0,3
-6360,convert_element_type_1683,call_function,convert_element_type.default,backward,5,1,1,1,5480,648,6
-6361,convert_element_type_1684,call_function,convert_element_type.default,backward,5,1,1,1,646,658,4
-6362,alias_default_1152,call_function,alias.default,backward,5,1,1,2,647,657,4
-6363,neg_50,call_function,neg.default,backward,5,1,1,1,648,656,8
-6364,exp_50,call_function,exp.default,backward,5,1,1,1,649,655,6
-6365,add_295,call_function,add.Tensor,backward,5,1,1,1,650,654,4
-6366,reciprocal_22,call_function,reciprocal.default,backward,5,1,1,1,651,653,4
-6367,mul_646,call_function,mul.Tensor,backward,5,1,1,1,652,652,6
-6368,alias_default_1153,call_function,alias.default,backward,5,1,1,2,653,651,4
-6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8
-6370,sub_67,call_function,sub.Tensor,backward,5,1,1,1,654,649,4
-6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8
-6372,add_296,call_function,add.Tensor,backward,5,1,1,1,656,647,4
-6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8
-6374,convert_element_type_1685,call_function,convert_element_type.default,backward,5,1,1,1,5494,645,6
-6375,alias_default_1154,call_function,alias.default,backward,5,1,1,2,5495,644,4
-6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5
-6377,permute_1027,call_function,permute.default,backward,5,1,1,1,4,640,3
-6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5
-6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10
-6380,permute_1028,call_function,permute.default,backward,5,1,1,1,5497,2,4
-6381,dtype_cast_457,call_function,dtype_cast.default,backward,5,1,1,1,5498,1,4
-6382,alias_default_1296,call_function,alias.default,backward,5,1,1,0,5499,0,3
-6383,convert_element_type_1690,call_function,convert_element_type.default,backward,5,1,1,1,5503,637,8
-6384,convert_element_type_1691,call_function,convert_element_type.default,backward,5,1,1,1,626,637,4
-6385,convert_element_type_1692,call_function,convert_element_type.default,backward,5,1,1,1,3,631,2
-6386,alias_default_1155,call_function,alias.default,backward,5,1,1,2,5504,636,4
-6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8
-6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8
-6389,alias_default_1156,call_function,alias.default,backward,5,1,1,2,5507,629,4
-6390,alias_default_1157,call_function,alias.default,backward,5,1,1,3,635,635,4
-6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8
-6392,sum_91,call_function,sum.dim_IntList,backward,5,1,1,1,5512,627,5
-6393,div_73,call_function,div.Tensor,backward,5,1,1,1,636,627,6
-6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8
-6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10
-6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8
-6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8
-6398,sum_92,call_function,sum.dim_IntList,backward,5,1,1,1,5509,3,5
-6399,convert_element_type_1693,call_function,convert_element_type.default,backward,5,1,1,1,5517,623,6
-6400,convert_element_type_1694,call_function,convert_element_type.default,backward,5,1,1,1,5510,2,3
-6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10
-6402,dtype_cast_458,call_function,dtype_cast.default,backward,5,1,1,1,5511,1,3
-6403,alias_default_1300,call_function,alias.default,backward,5,1,1,0,5512,0,2
-6404,alias_default_1158,call_function,alias.default,unknown,,1,1,3,5519,621,4
-6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5
-6406,permute_1031,call_function,permute.default,backward,5,1,1,1,4,617,3
-6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5
-6408,permute_1032,call_function,permute.default,backward,5,1,1,1,5521,2,4
-6409,dtype_cast_459,call_function,dtype_cast.default,backward,5,1,1,1,5522,1,4
-6410,alias_default_1295,call_function,alias.default,backward,5,1,1,0,5523,0,3
-6411,view_1140,call_function,view.default,backward,5,1,1,1,5522,615,4
-6412,permute_1033,call_function,permute.default,backward,5,1,1,1,5523,614,4
-6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2
-6414,getitem_318,call_function,getitem,backward,5,1,1,1,5528,586,2
-6415,getitem_319,call_function,getitem,backward,5,1,1,1,5528,587,2
-6416,getitem_320,call_function,getitem,backward,5,1,1,1,5528,580,2
-6417,permute_1034,call_function,permute.default,backward,5,1,1,1,5529,579,2
-6418,permute_1035,call_function,permute.default,backward,5,1,1,1,5529,586,2
-6419,permute_1036,call_function,permute.default,backward,5,1,1,1,5529,585,2
-6420,convert_element_type_1699,call_function,convert_element_type.default,backward,5,1,1,1,5530,585,2
-6421,convert_element_type_1700,call_function,convert_element_type.default,backward,5,1,1,1,5530,584,2
-6422,view_1141,call_function,view.default,backward,5,1,1,1,5531,584,2
-6423,view_as_complex_100,call_function,view_as_complex.default,backward,5,1,1,1,5532,583,6
-6424,_conj_44,call_function,_conj.default,backward,5,1,1,1,4,584,3
-6425,clone_182,call_function,clone.default,backward,5,1,1,1,5,583,3
-6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8
-6427,view_1142,call_function,view.default,backward,5,1,1,1,5531,583,2
-6428,view_as_complex_101,call_function,view_as_complex.default,backward,5,1,1,1,5532,582,6
-6429,_conj_45,call_function,_conj.default,backward,5,1,1,1,4,583,3
-6430,clone_183,call_function,clone.default,backward,5,1,1,1,5,582,3
-6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8
-6432,view_as_real_100,call_function,view_as_real.default,backward,5,1,1,1,5536,581,6
-6433,view_1143,call_function,view.default,backward,5,1,1,1,5537,580,6
-6434,convert_element_type_1701,call_function,convert_element_type.default,backward,5,1,1,1,5538,579,6
-6435,view_as_real_101,call_function,view_as_real.default,backward,5,1,1,1,5536,580,6
-6436,view_1144,call_function,view.default,backward,5,1,1,1,5537,579,6
-6437,convert_element_type_1702,call_function,convert_element_type.default,backward,5,1,1,1,5538,578,6
-6438,view_1145,call_function,view.default,backward,5,1,1,1,5530,578,2
-6439,view_1146,call_function,view.default,backward,5,1,1,1,5539,578,5
-6440,view_1147,call_function,view.default,backward,5,1,1,1,5539,577,5
-6441,alias_default_1159,call_function,alias.default,backward,5,1,1,2,5531,577,4
-6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5
-6443,permute_1039,call_function,permute.default,backward,5,1,1,1,4,573,3
-6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5
-6445,permute_1040,call_function,permute.default,backward,5,1,1,1,5533,2,4
-6446,dtype_cast_460,call_function,dtype_cast.default,backward,5,1,1,1,5534,1,4
-6447,alias_default_1294,call_function,alias.default,backward,5,1,1,0,5535,0,3
-6448,alias_default_1160,call_function,alias.default,backward,5,1,1,2,5540,577,4
-6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5
-6450,permute_1043,call_function,permute.default,backward,5,1,1,1,4,573,3
-6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5
-6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10
-6453,permute_1044,call_function,permute.default,backward,5,1,1,1,5542,2,4
-6454,dtype_cast_461,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4
-6455,alias_default_1293,call_function,alias.default,backward,5,1,1,0,5544,0,3
-6456,alias_default_1161,call_function,alias.default,backward,5,1,1,2,5540,576,4
-6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5
-6458,permute_1047,call_function,permute.default,backward,5,1,1,1,4,572,3
-6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5
-6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10
-6461,permute_1048,call_function,permute.default,backward,5,1,1,1,5542,2,4
-6462,dtype_cast_462,call_function,dtype_cast.default,backward,5,1,1,1,5543,1,4
-6463,alias_default_1292,call_function,alias.default,backward,5,1,1,0,5544,0,3
-6464,convert_element_type_1715,call_function,convert_element_type.default,backward,5,1,1,1,5566,569,8
-6465,convert_element_type_1716,call_function,convert_element_type.default,backward,5,1,1,1,559,569,4
-6466,convert_element_type_1717,call_function,convert_element_type.default,backward,5,1,1,1,3,563,2
-6467,alias_default_1162,call_function,alias.default,backward,5,1,1,2,5567,568,4
-6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8
-6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8
-6470,alias_default_1163,call_function,alias.default,backward,5,1,1,2,5570,561,4
-6471,alias_default_1164,call_function,alias.default,backward,5,1,1,3,568,567,4
-6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8
-6473,sum_93,call_function,sum.dim_IntList,backward,5,1,1,1,5575,559,5
-6474,div_74,call_function,div.Tensor,backward,5,1,1,1,569,559,6
-6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8
-6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10
-6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8
-6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8
-6479,sum_94,call_function,sum.dim_IntList,backward,5,1,1,1,5572,3,5
-6480,convert_element_type_1718,call_function,convert_element_type.default,backward,5,1,1,1,5580,555,6
-6481,convert_element_type_1719,call_function,convert_element_type.default,backward,5,1,1,1,5573,2,3
-6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10
-6483,dtype_cast_463,call_function,dtype_cast.default,backward,5,1,1,1,5574,1,3
-6484,alias_default_1299,call_function,alias.default,backward,5,1,1,0,5575,0,2
-6485,alias_default_1165,call_function,alias.default,unknown,,1,1,3,5582,553,4
-6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5
-6487,permute_1051,call_function,permute.default,backward,4,1,1,1,4,549,3
-6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5
-6489,permute_1052,call_function,permute.default,backward,4,1,1,1,5584,2,4
-6490,dtype_cast_464,call_function,dtype_cast.default,backward,4,1,1,1,5585,1,4
-6491,alias_default_1288,call_function,alias.default,backward,4,1,1,0,5586,0,3
-6492,alias_default_1166,call_function,alias.default,backward,4,1,1,2,5585,547,4
-6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8
-6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8
-6495,alias_default_1167,call_function,alias.default,backward,4,1,1,2,5587,534,4
-6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5
-6497,permute_1055,call_function,permute.default,backward,4,1,1,1,4,530,3
-6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5
-6499,permute_1056,call_function,permute.default,backward,4,1,1,1,5589,2,4
-6500,dtype_cast_465,call_function,dtype_cast.default,backward,4,1,1,1,5590,1,4
-6501,alias_default_1289,call_function,alias.default,backward,4,1,1,0,5591,0,3
-6502,convert_element_type_1728,call_function,convert_element_type.default,backward,4,1,1,1,5587,538,6
-6503,convert_element_type_1729,call_function,convert_element_type.default,backward,4,1,1,1,536,548,4
-6504,alias_default_1168,call_function,alias.default,backward,4,1,1,2,537,547,4
-6505,neg_51,call_function,neg.default,backward,4,1,1,1,538,546,8
-6506,exp_51,call_function,exp.default,backward,4,1,1,1,539,545,6
-6507,add_302,call_function,add.Tensor,backward,4,1,1,1,540,544,4
-6508,reciprocal_23,call_function,reciprocal.default,backward,4,1,1,1,541,543,4
-6509,mul_666,call_function,mul.Tensor,backward,4,1,1,1,542,542,6
-6510,alias_default_1169,call_function,alias.default,backward,4,1,1,2,543,541,4
-6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8
-6512,sub_70,call_function,sub.Tensor,backward,4,1,1,1,544,539,4
-6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8
-6514,add_303,call_function,add.Tensor,backward,4,1,1,1,546,537,4
-6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8
-6516,convert_element_type_1730,call_function,convert_element_type.default,backward,4,1,1,1,5601,535,6
-6517,alias_default_1170,call_function,alias.default,backward,4,1,1,2,5602,534,4
-6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5
-6519,permute_1059,call_function,permute.default,backward,4,1,1,1,4,530,3
-6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5
-6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10
-6522,permute_1060,call_function,permute.default,backward,4,1,1,1,5604,2,4
-6523,dtype_cast_466,call_function,dtype_cast.default,backward,4,1,1,1,5605,1,4
-6524,alias_default_1287,call_function,alias.default,backward,4,1,1,0,5606,0,3
-6525,convert_element_type_1735,call_function,convert_element_type.default,backward,4,1,1,1,5610,527,8
-6526,convert_element_type_1736,call_function,convert_element_type.default,backward,4,1,1,1,516,527,4
-6527,convert_element_type_1737,call_function,convert_element_type.default,backward,4,1,1,1,3,521,2
-6528,alias_default_1171,call_function,alias.default,backward,4,1,1,2,5611,526,4
-6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8
-6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8
-6531,alias_default_1172,call_function,alias.default,backward,4,1,1,2,5614,519,4
-6532,alias_default_1173,call_function,alias.default,backward,4,1,1,3,525,525,4
-6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8
-6534,sum_95,call_function,sum.dim_IntList,backward,4,1,1,1,5619,517,5
-6535,div_75,call_function,div.Tensor,backward,4,1,1,1,526,517,6
-6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8
-6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10
-6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8
-6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8
-6540,sum_96,call_function,sum.dim_IntList,backward,4,1,1,1,5616,3,5
-6541,convert_element_type_1738,call_function,convert_element_type.default,backward,4,1,1,1,5624,513,6
-6542,convert_element_type_1739,call_function,convert_element_type.default,backward,4,1,1,1,5617,2,3
-6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10
-6544,dtype_cast_467,call_function,dtype_cast.default,backward,4,1,1,1,5618,1,3
-6545,alias_default_1291,call_function,alias.default,backward,4,1,1,0,5619,0,2
-6546,alias_default_1174,call_function,alias.default,unknown,,1,1,3,5626,511,4
-6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5
-6548,permute_1063,call_function,permute.default,backward,4,1,1,1,4,507,3
-6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5
-6550,permute_1064,call_function,permute.default,backward,4,1,1,1,5628,2,4
-6551,dtype_cast_468,call_function,dtype_cast.default,backward,4,1,1,1,5629,1,4
-6552,alias_default_1286,call_function,alias.default,backward,4,1,1,0,5630,0,3
-6553,view_1162,call_function,view.default,backward,4,1,1,1,5629,505,4
-6554,permute_1065,call_function,permute.default,backward,4,1,1,1,5630,504,4
-6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2
-6556,getitem_321,call_function,getitem,backward,4,1,1,1,5635,476,2
-6557,getitem_322,call_function,getitem,backward,4,1,1,1,5635,477,2
-6558,getitem_323,call_function,getitem,backward,4,1,1,1,5635,470,2
-6559,permute_1066,call_function,permute.default,backward,4,1,1,1,5636,469,2
-6560,permute_1067,call_function,permute.default,backward,4,1,1,1,5636,476,2
-6561,permute_1068,call_function,permute.default,backward,4,1,1,1,5636,475,2
-6562,convert_element_type_1744,call_function,convert_element_type.default,backward,4,1,1,1,5637,475,2
-6563,convert_element_type_1745,call_function,convert_element_type.default,backward,4,1,1,1,5637,474,2
-6564,view_1163,call_function,view.default,backward,4,1,1,1,5638,474,2
-6565,view_as_complex_102,call_function,view_as_complex.default,backward,4,1,1,1,5639,473,6
-6566,_conj_46,call_function,_conj.default,backward,4,1,1,1,4,474,3
-6567,clone_190,call_function,clone.default,backward,4,1,1,1,5,473,3
-6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8
-6569,view_1164,call_function,view.default,backward,4,1,1,1,5638,473,2
-6570,view_as_complex_103,call_function,view_as_complex.default,backward,4,1,1,1,5639,472,6
-6571,_conj_47,call_function,_conj.default,backward,4,1,1,1,4,473,3
-6572,clone_191,call_function,clone.default,backward,4,1,1,1,5,472,3
-6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8
-6574,view_as_real_102,call_function,view_as_real.default,backward,4,1,1,1,5643,471,6
-6575,view_1165,call_function,view.default,backward,4,1,1,1,5644,470,6
-6576,convert_element_type_1746,call_function,convert_element_type.default,backward,4,1,1,1,5645,469,6
-6577,view_as_real_103,call_function,view_as_real.default,backward,4,1,1,1,5643,470,6
-6578,view_1166,call_function,view.default,backward,4,1,1,1,5644,469,6
-6579,convert_element_type_1747,call_function,convert_element_type.default,backward,4,1,1,1,5645,468,6
-6580,view_1167,call_function,view.default,backward,4,1,1,1,5637,468,2
-6581,view_1168,call_function,view.default,backward,4,1,1,1,5646,468,5
-6582,view_1169,call_function,view.default,backward,4,1,1,1,5646,467,5
-6583,alias_default_1175,call_function,alias.default,backward,4,1,1,2,5638,467,4
-6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5
-6585,permute_1071,call_function,permute.default,backward,4,1,1,1,4,463,3
-6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5
-6587,permute_1072,call_function,permute.default,backward,4,1,1,1,5640,2,4
-6588,dtype_cast_469,call_function,dtype_cast.default,backward,4,1,1,1,5641,1,4
-6589,alias_default_1285,call_function,alias.default,backward,4,1,1,0,5642,0,3
-6590,alias_default_1176,call_function,alias.default,backward,4,1,1,2,5647,467,4
-6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5
-6592,permute_1075,call_function,permute.default,backward,4,1,1,1,4,463,3
-6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5
-6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10
-6595,permute_1076,call_function,permute.default,backward,4,1,1,1,5649,2,4
-6596,dtype_cast_470,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4
-6597,alias_default_1284,call_function,alias.default,backward,4,1,1,0,5651,0,3
-6598,alias_default_1177,call_function,alias.default,backward,4,1,1,2,5647,466,4
-6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5
-6600,permute_1079,call_function,permute.default,backward,4,1,1,1,4,462,3
-6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5
-6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10
-6603,permute_1080,call_function,permute.default,backward,4,1,1,1,5649,2,4
-6604,dtype_cast_471,call_function,dtype_cast.default,backward,4,1,1,1,5650,1,4
-6605,alias_default_1283,call_function,alias.default,backward,4,1,1,0,5651,0,3
-6606,convert_element_type_1760,call_function,convert_element_type.default,backward,4,1,1,1,5673,459,8
-6607,convert_element_type_1761,call_function,convert_element_type.default,backward,4,1,1,1,449,459,4
-6608,convert_element_type_1762,call_function,convert_element_type.default,backward,4,1,1,1,3,453,2
-6609,alias_default_1178,call_function,alias.default,backward,4,1,1,2,5674,458,4
-6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8
-6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8
-6612,alias_default_1179,call_function,alias.default,backward,4,1,1,2,5677,451,4
-6613,alias_default_1180,call_function,alias.default,backward,4,1,1,3,458,457,4
-6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8
-6615,sum_97,call_function,sum.dim_IntList,backward,4,1,1,1,5682,449,5
-6616,div_76,call_function,div.Tensor,backward,4,1,1,1,459,449,6
-6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8
-6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10
-6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8
-6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8
-6621,sum_98,call_function,sum.dim_IntList,backward,4,1,1,1,5679,3,5
-6622,convert_element_type_1763,call_function,convert_element_type.default,backward,4,1,1,1,5687,445,6
-6623,convert_element_type_1764,call_function,convert_element_type.default,backward,4,1,1,1,5680,2,3
-6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10
-6625,dtype_cast_472,call_function,dtype_cast.default,backward,4,1,1,1,5681,1,3
-6626,alias_default_1290,call_function,alias.default,backward,4,1,1,0,5682,0,2
-6627,alias_default_1181,call_function,alias.default,unknown,,1,1,3,5689,443,4
-6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5
-6629,permute_1083,call_function,permute.default,backward,3,1,1,1,4,439,3
-6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5
-6631,permute_1084,call_function,permute.default,backward,3,1,1,1,5691,2,4
-6632,dtype_cast_473,call_function,dtype_cast.default,backward,3,1,1,1,5692,1,4
-6633,alias_default_1279,call_function,alias.default,backward,3,1,1,0,5693,0,3
-6634,alias_default_1182,call_function,alias.default,backward,3,1,1,2,5692,437,4
-6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8
-6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8
-6637,alias_default_1183,call_function,alias.default,backward,3,1,1,2,5694,424,4
-6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5
-6639,permute_1087,call_function,permute.default,backward,3,1,1,1,4,420,3
-6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5
-6641,permute_1088,call_function,permute.default,backward,3,1,1,1,5696,2,4
-6642,dtype_cast_474,call_function,dtype_cast.default,backward,3,1,1,1,5697,1,4
-6643,alias_default_1280,call_function,alias.default,backward,3,1,1,0,5698,0,3
-6644,convert_element_type_1773,call_function,convert_element_type.default,backward,3,1,1,1,5694,428,6
-6645,convert_element_type_1774,call_function,convert_element_type.default,backward,3,1,1,1,426,438,4
-6646,alias_default_1184,call_function,alias.default,backward,3,1,1,2,427,437,4
-6647,neg_52,call_function,neg.default,backward,3,1,1,1,428,436,8
-6648,exp_52,call_function,exp.default,backward,3,1,1,1,429,435,6
-6649,add_309,call_function,add.Tensor,backward,3,1,1,1,430,434,4
-6650,reciprocal_24,call_function,reciprocal.default,backward,3,1,1,1,431,433,4
-6651,mul_686,call_function,mul.Tensor,backward,3,1,1,1,432,432,6
-6652,alias_default_1185,call_function,alias.default,backward,3,1,1,2,433,431,4
-6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8
-6654,sub_73,call_function,sub.Tensor,backward,3,1,1,1,434,429,4
-6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8
-6656,add_310,call_function,add.Tensor,backward,3,1,1,1,436,427,4
-6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8
-6658,convert_element_type_1775,call_function,convert_element_type.default,backward,3,1,1,1,5708,425,6
-6659,alias_default_1186,call_function,alias.default,backward,3,1,1,2,5709,424,4
-6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5
-6661,permute_1091,call_function,permute.default,backward,3,1,1,1,4,420,3
-6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5
-6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10
-6664,permute_1092,call_function,permute.default,backward,3,1,1,1,5711,2,4
-6665,dtype_cast_475,call_function,dtype_cast.default,backward,3,1,1,1,5712,1,4
-6666,alias_default_1278,call_function,alias.default,backward,3,1,1,0,5713,0,3
-6667,convert_element_type_1780,call_function,convert_element_type.default,backward,3,1,1,1,5717,417,8
-6668,convert_element_type_1781,call_function,convert_element_type.default,backward,3,1,1,1,406,417,4
-6669,convert_element_type_1782,call_function,convert_element_type.default,backward,3,1,1,1,3,411,2
-6670,alias_default_1187,call_function,alias.default,backward,3,1,1,2,5718,416,4
-6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8
-6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8
-6673,alias_default_1188,call_function,alias.default,backward,3,1,1,2,5721,409,4
-6674,alias_default_1189,call_function,alias.default,backward,3,1,1,3,415,415,4
-6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8
-6676,sum_99,call_function,sum.dim_IntList,backward,3,1,1,1,5726,407,5
-6677,div_77,call_function,div.Tensor,backward,3,1,1,1,416,407,6
-6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8
-6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10
-6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8
-6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8
-6682,sum_100,call_function,sum.dim_IntList,backward,3,1,1,1,5723,3,5
-6683,convert_element_type_1783,call_function,convert_element_type.default,backward,3,1,1,1,5731,403,6
-6684,convert_element_type_1784,call_function,convert_element_type.default,backward,3,1,1,1,5724,2,3
-6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10
-6686,dtype_cast_476,call_function,dtype_cast.default,backward,3,1,1,1,5725,1,3
-6687,alias_default_1282,call_function,alias.default,backward,3,1,1,0,5726,0,2
-6688,alias_default_1190,call_function,alias.default,unknown,,1,1,3,5733,401,4
-6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5
-6690,permute_1095,call_function,permute.default,backward,3,1,1,1,4,397,3
-6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5
-6692,permute_1096,call_function,permute.default,backward,3,1,1,1,5735,2,4
-6693,dtype_cast_477,call_function,dtype_cast.default,backward,3,1,1,1,5736,1,4
-6694,alias_default_1277,call_function,alias.default,backward,3,1,1,0,5737,0,3
-6695,view_1184,call_function,view.default,backward,3,1,1,1,5736,395,4
-6696,permute_1097,call_function,permute.default,backward,3,1,1,1,5737,394,4
-6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2
-6698,getitem_324,call_function,getitem,backward,3,1,1,1,5742,366,2
-6699,getitem_325,call_function,getitem,backward,3,1,1,1,5742,367,2
-6700,getitem_326,call_function,getitem,backward,3,1,1,1,5742,360,2
-6701,permute_1098,call_function,permute.default,backward,3,1,1,1,5743,359,2
-6702,permute_1099,call_function,permute.default,backward,3,1,1,1,5743,366,2
-6703,permute_1100,call_function,permute.default,backward,3,1,1,1,5743,365,2
-6704,convert_element_type_1789,call_function,convert_element_type.default,backward,3,1,1,1,5744,365,2
-6705,convert_element_type_1790,call_function,convert_element_type.default,backward,3,1,1,1,5744,364,2
-6706,view_1185,call_function,view.default,backward,3,1,1,1,5745,364,2
-6707,view_as_complex_104,call_function,view_as_complex.default,backward,3,1,1,1,5746,363,6
-6708,_conj_48,call_function,_conj.default,backward,3,1,1,1,4,364,3
-6709,clone_198,call_function,clone.default,backward,3,1,1,1,5,363,3
-6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8
-6711,view_1186,call_function,view.default,backward,3,1,1,1,5745,363,2
-6712,view_as_complex_105,call_function,view_as_complex.default,backward,3,1,1,1,5746,362,6
-6713,_conj_49,call_function,_conj.default,backward,3,1,1,1,4,363,3
-6714,clone_199,call_function,clone.default,backward,3,1,1,1,5,362,3
-6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8
-6716,view_as_real_104,call_function,view_as_real.default,backward,3,1,1,1,5750,361,6
-6717,view_1187,call_function,view.default,backward,3,1,1,1,5751,360,6
-6718,convert_element_type_1791,call_function,convert_element_type.default,backward,3,1,1,1,5752,359,6
-6719,view_as_real_105,call_function,view_as_real.default,backward,3,1,1,1,5750,360,6
-6720,view_1188,call_function,view.default,backward,3,1,1,1,5751,359,6
-6721,convert_element_type_1792,call_function,convert_element_type.default,backward,3,1,1,1,5752,358,6
-6722,view_1189,call_function,view.default,backward,3,1,1,1,5744,358,2
-6723,view_1190,call_function,view.default,backward,3,1,1,1,5753,358,5
-6724,view_1191,call_function,view.default,backward,3,1,1,1,5753,357,5
-6725,alias_default_1191,call_function,alias.default,backward,3,1,1,2,5745,357,4
-6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5
-6727,permute_1103,call_function,permute.default,backward,3,1,1,1,4,353,3
-6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5
-6729,permute_1104,call_function,permute.default,backward,3,1,1,1,5747,2,4
-6730,dtype_cast_478,call_function,dtype_cast.default,backward,3,1,1,1,5748,1,4
-6731,alias_default_1276,call_function,alias.default,backward,3,1,1,0,5749,0,3
-6732,alias_default_1192,call_function,alias.default,backward,3,1,1,2,5754,357,4
-6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5
-6734,permute_1107,call_function,permute.default,backward,3,1,1,1,4,353,3
-6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5
-6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10
-6737,permute_1108,call_function,permute.default,backward,3,1,1,1,5756,2,4
-6738,dtype_cast_479,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4
-6739,alias_default_1275,call_function,alias.default,backward,3,1,1,0,5758,0,3
-6740,alias_default_1193,call_function,alias.default,backward,3,1,1,2,5754,356,4
-6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5
-6742,permute_1111,call_function,permute.default,backward,3,1,1,1,4,352,3
-6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5
-6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10
-6745,permute_1112,call_function,permute.default,backward,3,1,1,1,5756,2,4
-6746,dtype_cast_480,call_function,dtype_cast.default,backward,3,1,1,1,5757,1,4
-6747,alias_default_1274,call_function,alias.default,backward,3,1,1,0,5758,0,3
-6748,convert_element_type_1805,call_function,convert_element_type.default,backward,3,1,1,1,5780,349,8
-6749,convert_element_type_1806,call_function,convert_element_type.default,backward,3,1,1,1,339,349,4
-6750,convert_element_type_1807,call_function,convert_element_type.default,backward,3,1,1,1,3,343,2
-6751,alias_default_1194,call_function,alias.default,backward,3,1,1,2,5781,348,4
-6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8
-6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8
-6754,alias_default_1195,call_function,alias.default,backward,3,1,1,2,5784,341,4
-6755,alias_default_1196,call_function,alias.default,backward,3,1,1,3,348,347,4
-6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8
-6757,sum_101,call_function,sum.dim_IntList,backward,3,1,1,1,5789,339,5
-6758,div_78,call_function,div.Tensor,backward,3,1,1,1,349,339,6
-6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8
-6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10
-6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8
-6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8
-6763,sum_102,call_function,sum.dim_IntList,backward,3,1,1,1,5786,3,5
-6764,convert_element_type_1808,call_function,convert_element_type.default,backward,3,1,1,1,5794,335,6
-6765,convert_element_type_1809,call_function,convert_element_type.default,backward,3,1,1,1,5787,2,3
-6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10
-6767,dtype_cast_481,call_function,dtype_cast.default,backward,3,1,1,1,5788,1,3
-6768,alias_default_1281,call_function,alias.default,backward,3,1,1,0,5789,0,2
-6769,alias_default_1197,call_function,alias.default,unknown,,1,1,3,5796,333,4
-6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5
-6771,permute_1115,call_function,permute.default,backward,2,1,1,1,4,329,3
-6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5
-6773,permute_1116,call_function,permute.default,backward,2,1,1,1,5798,2,4
-6774,dtype_cast_482,call_function,dtype_cast.default,backward,2,1,1,1,5799,1,4
-6775,alias_default_1270,call_function,alias.default,backward,2,1,1,0,5800,0,3
-6776,alias_default_1198,call_function,alias.default,backward,2,1,1,2,5799,327,4
-6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8
-6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8
-6779,alias_default_1199,call_function,alias.default,backward,2,1,1,2,5801,314,4
-6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5
-6781,permute_1119,call_function,permute.default,backward,2,1,1,1,4,310,3
-6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5
-6783,permute_1120,call_function,permute.default,backward,2,1,1,1,5803,2,4
-6784,dtype_cast_483,call_function,dtype_cast.default,backward,2,1,1,1,5804,1,4
-6785,alias_default_1271,call_function,alias.default,backward,2,1,1,0,5805,0,3
-6786,convert_element_type_1818,call_function,convert_element_type.default,backward,2,1,1,1,5801,318,6
-6787,convert_element_type_1819,call_function,convert_element_type.default,backward,2,1,1,1,316,328,4
-6788,alias_default_1200,call_function,alias.default,backward,2,1,1,2,317,327,4
-6789,neg_53,call_function,neg.default,backward,2,1,1,1,318,326,8
-6790,exp_53,call_function,exp.default,backward,2,1,1,1,319,325,6
-6791,add_316,call_function,add.Tensor,backward,2,1,1,1,320,324,4
-6792,reciprocal_25,call_function,reciprocal.default,backward,2,1,1,1,321,323,4
-6793,mul_706,call_function,mul.Tensor,backward,2,1,1,1,322,322,6
-6794,alias_default_1201,call_function,alias.default,backward,2,1,1,2,323,321,4
-6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8
-6796,sub_76,call_function,sub.Tensor,backward,2,1,1,1,324,319,4
-6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8
-6798,add_317,call_function,add.Tensor,backward,2,1,1,1,326,317,4
-6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8
-6800,convert_element_type_1820,call_function,convert_element_type.default,backward,2,1,1,1,5815,315,6
-6801,alias_default_1202,call_function,alias.default,backward,2,1,1,2,5816,314,4
-6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5
-6803,permute_1123,call_function,permute.default,backward,2,1,1,1,4,310,3
-6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5
-6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10
-6806,permute_1124,call_function,permute.default,backward,2,1,1,1,5818,2,4
-6807,dtype_cast_484,call_function,dtype_cast.default,backward,2,1,1,1,5819,1,4
-6808,alias_default_1269,call_function,alias.default,backward,2,1,1,0,5820,0,3
-6809,convert_element_type_1825,call_function,convert_element_type.default,backward,2,1,1,1,5824,307,8
-6810,convert_element_type_1826,call_function,convert_element_type.default,backward,2,1,1,1,296,307,4
-6811,convert_element_type_1827,call_function,convert_element_type.default,backward,2,1,1,1,3,301,2
-6812,alias_default_1203,call_function,alias.default,backward,2,1,1,2,5825,306,4
-6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8
-6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8
-6815,alias_default_1204,call_function,alias.default,backward,2,1,1,2,5828,299,4
-6816,alias_default_1205,call_function,alias.default,backward,2,1,1,3,305,305,4
-6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8
-6818,sum_103,call_function,sum.dim_IntList,backward,2,1,1,1,5833,297,5
-6819,div_79,call_function,div.Tensor,backward,2,1,1,1,306,297,6
-6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8
-6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10
-6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8
-6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8
-6824,sum_104,call_function,sum.dim_IntList,backward,2,1,1,1,5830,3,5
-6825,convert_element_type_1828,call_function,convert_element_type.default,backward,2,1,1,1,5838,293,6
-6826,convert_element_type_1829,call_function,convert_element_type.default,backward,2,1,1,1,5831,2,3
-6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10
-6828,dtype_cast_485,call_function,dtype_cast.default,backward,2,1,1,1,5832,1,3
-6829,alias_default_1273,call_function,alias.default,backward,2,1,1,0,5833,0,2
-6830,alias_default_1206,call_function,alias.default,unknown,,1,1,3,5840,291,4
-6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5
-6832,permute_1127,call_function,permute.default,backward,2,1,1,1,4,287,3
-6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5
-6834,permute_1128,call_function,permute.default,backward,2,1,1,1,5842,2,4
-6835,dtype_cast_486,call_function,dtype_cast.default,backward,2,1,1,1,5843,1,4
-6836,alias_default_1268,call_function,alias.default,backward,2,1,1,0,5844,0,3
-6837,view_1206,call_function,view.default,backward,2,1,1,1,5843,285,4
-6838,permute_1129,call_function,permute.default,backward,2,1,1,1,5844,284,4
-6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2
-6840,getitem_327,call_function,getitem,backward,2,1,1,1,5849,256,2
-6841,getitem_328,call_function,getitem,backward,2,1,1,1,5849,257,2
-6842,getitem_329,call_function,getitem,backward,2,1,1,1,5849,250,2
-6843,permute_1130,call_function,permute.default,backward,2,1,1,1,5850,249,2
-6844,permute_1131,call_function,permute.default,backward,2,1,1,1,5850,256,2
-6845,permute_1132,call_function,permute.default,backward,2,1,1,1,5850,255,2
-6846,convert_element_type_1834,call_function,convert_element_type.default,backward,2,1,1,1,5851,255,2
-6847,convert_element_type_1835,call_function,convert_element_type.default,backward,2,1,1,1,5851,254,2
-6848,view_1207,call_function,view.default,backward,2,1,1,1,5852,254,2
-6849,view_as_complex_106,call_function,view_as_complex.default,backward,2,1,1,1,5853,253,6
-6850,_conj_50,call_function,_conj.default,backward,2,1,1,1,4,254,3
-6851,clone_206,call_function,clone.default,backward,2,1,1,1,5,253,3
-6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8
-6853,view_1208,call_function,view.default,backward,2,1,1,1,5852,253,2
-6854,view_as_complex_107,call_function,view_as_complex.default,backward,2,1,1,1,5853,252,6
-6855,_conj_51,call_function,_conj.default,backward,2,1,1,1,4,253,3
-6856,clone_207,call_function,clone.default,backward,2,1,1,1,5,252,3
-6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8
-6858,view_as_real_106,call_function,view_as_real.default,backward,2,1,1,1,5857,251,6
-6859,view_1209,call_function,view.default,backward,2,1,1,1,5858,250,6
-6860,convert_element_type_1836,call_function,convert_element_type.default,backward,2,1,1,1,5859,249,6
-6861,view_as_real_107,call_function,view_as_real.default,backward,2,1,1,1,5857,250,6
-6862,view_1210,call_function,view.default,backward,2,1,1,1,5858,249,6
-6863,convert_element_type_1837,call_function,convert_element_type.default,backward,2,1,1,1,5859,248,6
-6864,view_1211,call_function,view.default,backward,2,1,1,1,5851,248,2
-6865,view_1212,call_function,view.default,backward,2,1,1,1,5860,248,5
-6866,view_1213,call_function,view.default,backward,2,1,1,1,5860,247,5
-6867,alias_default_1207,call_function,alias.default,backward,2,1,1,2,5852,247,4
-6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5
-6869,permute_1135,call_function,permute.default,backward,2,1,1,1,4,243,3
-6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5
-6871,permute_1136,call_function,permute.default,backward,2,1,1,1,5854,2,4
-6872,dtype_cast_487,call_function,dtype_cast.default,backward,2,1,1,1,5855,1,4
-6873,alias_default_1267,call_function,alias.default,backward,2,1,1,0,5856,0,3
-6874,alias_default_1208,call_function,alias.default,backward,2,1,1,2,5861,247,4
-6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5
-6876,permute_1139,call_function,permute.default,backward,2,1,1,1,4,243,3
-6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5
-6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10
-6879,permute_1140,call_function,permute.default,backward,2,1,1,1,5863,2,4
-6880,dtype_cast_488,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4
-6881,alias_default_1266,call_function,alias.default,backward,2,1,1,0,5865,0,3
-6882,alias_default_1209,call_function,alias.default,backward,2,1,1,2,5861,246,4
-6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5
-6884,permute_1143,call_function,permute.default,backward,2,1,1,1,4,242,3
-6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5
-6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10
-6887,permute_1144,call_function,permute.default,backward,2,1,1,1,5863,2,4
-6888,dtype_cast_489,call_function,dtype_cast.default,backward,2,1,1,1,5864,1,4
-6889,alias_default_1265,call_function,alias.default,backward,2,1,1,0,5865,0,3
-6890,convert_element_type_1850,call_function,convert_element_type.default,backward,2,1,1,1,5887,239,8
-6891,convert_element_type_1851,call_function,convert_element_type.default,backward,2,1,1,1,229,239,4
-6892,convert_element_type_1852,call_function,convert_element_type.default,backward,2,1,1,1,3,233,2
-6893,alias_default_1210,call_function,alias.default,backward,2,1,1,2,5888,238,4
-6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8
-6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8
-6896,alias_default_1211,call_function,alias.default,backward,2,1,1,2,5891,231,4
-6897,alias_default_1212,call_function,alias.default,backward,2,1,1,3,238,237,4
-6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8
-6899,sum_105,call_function,sum.dim_IntList,backward,2,1,1,1,5896,229,5
-6900,div_80,call_function,div.Tensor,backward,2,1,1,1,239,229,6
-6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8
-6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10
-6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8
-6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8
-6905,sum_106,call_function,sum.dim_IntList,backward,2,1,1,1,5893,3,5
-6906,convert_element_type_1853,call_function,convert_element_type.default,backward,2,1,1,1,5901,225,6
-6907,convert_element_type_1854,call_function,convert_element_type.default,backward,2,1,1,1,5894,2,3
-6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10
-6909,dtype_cast_490,call_function,dtype_cast.default,backward,2,1,1,1,5895,1,3
-6910,alias_default_1272,call_function,alias.default,backward,2,1,1,0,5896,0,2
-6911,alias_default_1213,call_function,alias.default,unknown,,1,1,3,5903,223,4
-6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5
-6913,permute_1147,call_function,permute.default,backward,1,1,1,1,4,219,3
-6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5
-6915,permute_1148,call_function,permute.default,backward,1,1,1,1,5905,2,4
-6916,dtype_cast_491,call_function,dtype_cast.default,backward,1,1,1,1,5906,1,4
-6917,alias_default_1261,call_function,alias.default,backward,1,1,1,0,5907,0,3
-6918,alias_default_1214,call_function,alias.default,backward,1,1,1,2,5906,217,4
-6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8
-6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8
-6921,alias_default_1215,call_function,alias.default,backward,1,1,1,2,5908,204,4
-6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5
-6923,permute_1151,call_function,permute.default,backward,1,1,1,1,4,200,3
-6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5
-6925,permute_1152,call_function,permute.default,backward,1,1,1,1,5910,2,4
-6926,dtype_cast_492,call_function,dtype_cast.default,backward,1,1,1,1,5911,1,4
-6927,alias_default_1262,call_function,alias.default,backward,1,1,1,0,5912,0,3
-6928,convert_element_type_1863,call_function,convert_element_type.default,backward,1,1,1,1,5908,208,6
-6929,convert_element_type_1864,call_function,convert_element_type.default,backward,1,1,1,1,206,218,4
-6930,alias_default_1216,call_function,alias.default,backward,1,1,1,2,207,217,4
-6931,neg_54,call_function,neg.default,backward,1,1,1,1,208,216,8
-6932,exp_54,call_function,exp.default,backward,1,1,1,1,209,215,6
-6933,add_323,call_function,add.Tensor,backward,1,1,1,1,210,214,4
-6934,reciprocal_26,call_function,reciprocal.default,backward,1,1,1,1,211,213,4
-6935,mul_726,call_function,mul.Tensor,backward,1,1,1,1,212,212,6
-6936,alias_default_1217,call_function,alias.default,backward,1,1,1,2,213,211,4
-6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8
-6938,sub_79,call_function,sub.Tensor,backward,1,1,1,1,214,209,4
-6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8
-6940,add_324,call_function,add.Tensor,backward,1,1,1,1,216,207,4
-6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8
-6942,convert_element_type_1865,call_function,convert_element_type.default,backward,1,1,1,1,5922,205,6
-6943,alias_default_1218,call_function,alias.default,backward,1,1,1,2,5923,204,4
-6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5
-6945,permute_1155,call_function,permute.default,backward,1,1,1,1,4,200,3
-6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5
-6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10
-6948,permute_1156,call_function,permute.default,backward,1,1,1,1,5925,2,4
-6949,dtype_cast_493,call_function,dtype_cast.default,backward,1,1,1,1,5926,1,4
-6950,alias_default_1260,call_function,alias.default,backward,1,1,1,0,5927,0,3
-6951,convert_element_type_1870,call_function,convert_element_type.default,backward,1,1,1,1,5931,197,8
-6952,convert_element_type_1871,call_function,convert_element_type.default,backward,1,1,1,1,186,197,4
-6953,convert_element_type_1872,call_function,convert_element_type.default,backward,1,1,1,1,3,191,2
-6954,alias_default_1219,call_function,alias.default,backward,1,1,1,2,5932,196,4
-6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8
-6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8
-6957,alias_default_1220,call_function,alias.default,backward,1,1,1,2,5935,189,4
-6958,alias_default_1221,call_function,alias.default,backward,1,1,1,3,195,195,4
-6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8
-6960,sum_107,call_function,sum.dim_IntList,backward,1,1,1,1,5940,187,5
-6961,div_81,call_function,div.Tensor,backward,1,1,1,1,196,187,6
-6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8
-6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10
-6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8
-6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8
-6966,sum_108,call_function,sum.dim_IntList,backward,1,1,1,1,5937,3,5
-6967,convert_element_type_1873,call_function,convert_element_type.default,backward,1,1,1,1,5945,183,6
-6968,convert_element_type_1874,call_function,convert_element_type.default,backward,1,1,1,1,5938,2,3
-6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10
-6970,dtype_cast_494,call_function,dtype_cast.default,backward,1,1,1,1,5939,1,3
-6971,alias_default_1264,call_function,alias.default,backward,1,1,1,0,5940,0,2
-6972,alias_default_1222,call_function,alias.default,unknown,,1,1,3,5947,181,4
-6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5
-6974,permute_1159,call_function,permute.default,backward,1,1,1,1,4,177,3
-6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5
-6976,permute_1160,call_function,permute.default,backward,1,1,1,1,5949,2,4
-6977,dtype_cast_495,call_function,dtype_cast.default,backward,1,1,1,1,5950,1,4
-6978,alias_default_1259,call_function,alias.default,backward,1,1,1,0,5951,0,3
-6979,view_1228,call_function,view.default,backward,1,1,1,1,5950,175,4
-6980,permute_1161,call_function,permute.default,backward,1,1,1,1,5951,174,4
-6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2
-6982,getitem_330,call_function,getitem,backward,1,1,1,1,5956,146,2
-6983,getitem_331,call_function,getitem,backward,1,1,1,1,5956,147,2
-6984,getitem_332,call_function,getitem,backward,1,1,1,1,5956,140,2
-6985,permute_1162,call_function,permute.default,backward,1,1,1,1,5957,139,2
-6986,permute_1163,call_function,permute.default,backward,1,1,1,1,5957,146,2
-6987,permute_1164,call_function,permute.default,backward,1,1,1,1,5957,145,2
-6988,convert_element_type_1879,call_function,convert_element_type.default,backward,1,1,1,1,5958,145,2
-6989,convert_element_type_1880,call_function,convert_element_type.default,backward,1,1,1,1,5958,144,2
-6990,view_1229,call_function,view.default,backward,1,1,1,1,5959,144,2
-6991,view_as_complex_108,call_function,view_as_complex.default,backward,1,1,1,1,5960,143,6
-6992,_conj_52,call_function,_conj.default,backward,1,1,1,1,4,144,3
-6993,clone_214,call_function,clone.default,backward,1,1,1,1,5,143,3
-6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8
-6995,view_1230,call_function,view.default,backward,1,1,1,1,5959,143,2
-6996,view_as_complex_109,call_function,view_as_complex.default,backward,1,1,1,1,5960,142,6
-6997,_conj_53,call_function,_conj.default,backward,1,1,1,1,4,143,3
-6998,clone_215,call_function,clone.default,backward,1,1,1,1,5,142,3
-6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8
-7000,view_as_real_108,call_function,view_as_real.default,backward,1,1,1,1,5964,141,6
-7001,view_1231,call_function,view.default,backward,1,1,1,1,5965,140,6
-7002,convert_element_type_1881,call_function,convert_element_type.default,backward,1,1,1,1,5966,139,6
-7003,view_as_real_109,call_function,view_as_real.default,backward,1,1,1,1,5964,140,6
-7004,view_1232,call_function,view.default,backward,1,1,1,1,5965,139,6
-7005,convert_element_type_1882,call_function,convert_element_type.default,backward,1,1,1,1,5966,138,6
-7006,view_1233,call_function,view.default,backward,1,1,1,1,5958,138,2
-7007,view_1234,call_function,view.default,backward,1,1,1,1,5967,138,5
-7008,view_1235,call_function,view.default,backward,1,1,1,1,5967,137,5
-7009,alias_default_1223,call_function,alias.default,backward,1,1,1,2,5959,137,4
-7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5
-7011,permute_1167,call_function,permute.default,backward,1,1,1,1,4,133,3
-7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5
-7013,permute_1168,call_function,permute.default,backward,1,1,1,1,5961,2,4
-7014,dtype_cast_496,call_function,dtype_cast.default,backward,1,1,1,1,5962,1,4
-7015,alias_default_1258,call_function,alias.default,backward,1,1,1,0,5963,0,3
-7016,alias_default_1224,call_function,alias.default,backward,1,1,1,2,5968,137,4
-7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5
-7018,permute_1171,call_function,permute.default,backward,1,1,1,1,4,133,3
-7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5
-7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10
-7021,permute_1172,call_function,permute.default,backward,1,1,1,1,5970,2,4
-7022,dtype_cast_497,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4
-7023,alias_default_1257,call_function,alias.default,backward,1,1,1,0,5972,0,3
-7024,alias_default_1225,call_function,alias.default,backward,1,1,1,2,5968,136,4
-7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5
-7026,permute_1175,call_function,permute.default,backward,1,1,1,1,4,132,3
-7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5
-7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10
-7029,permute_1176,call_function,permute.default,backward,1,1,1,1,5970,2,4
-7030,dtype_cast_498,call_function,dtype_cast.default,backward,1,1,1,1,5971,1,4
-7031,alias_default_1256,call_function,alias.default,backward,1,1,1,0,5972,0,3
-7032,convert_element_type_1895,call_function,convert_element_type.default,backward,1,1,1,1,5994,129,8
-7033,convert_element_type_1896,call_function,convert_element_type.default,backward,1,1,1,1,119,129,4
-7034,convert_element_type_1897,call_function,convert_element_type.default,backward,1,1,1,1,3,123,2
-7035,alias_default_1226,call_function,alias.default,backward,1,1,1,2,5995,128,4
-7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8
-7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8
-7038,alias_default_1227,call_function,alias.default,backward,1,1,1,2,5998,121,4
-7039,alias_default_1228,call_function,alias.default,backward,1,1,1,3,128,127,4
-7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8
-7041,sum_109,call_function,sum.dim_IntList,backward,1,1,1,1,6003,119,5
-7042,div_82,call_function,div.Tensor,backward,1,1,1,1,129,119,6
-7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8
-7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10
-7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8
-7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8
-7047,sum_110,call_function,sum.dim_IntList,backward,1,1,1,1,6000,3,5
-7048,convert_element_type_1898,call_function,convert_element_type.default,backward,1,1,1,1,6008,115,6
-7049,convert_element_type_1899,call_function,convert_element_type.default,backward,1,1,1,1,6001,2,3
-7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10
-7051,dtype_cast_499,call_function,dtype_cast.default,backward,1,1,1,1,6002,1,3
-7052,alias_default_1263,call_function,alias.default,backward,1,1,1,0,6003,0,2
-7053,alias_default_1229,call_function,alias.default,unknown,,1,1,3,6010,113,4
-7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5
-7055,permute_1179,call_function,permute.default,backward,0,1,1,1,4,109,3
-7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5
-7057,permute_1180,call_function,permute.default,backward,0,1,1,1,6012,2,4
-7058,dtype_cast_500,call_function,dtype_cast.default,backward,0,1,1,1,6013,1,4
-7059,alias_default_1252,call_function,alias.default,backward,0,1,1,0,6014,0,3
-7060,alias_default_1230,call_function,alias.default,backward,0,1,1,2,6013,107,4
-7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8
-7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8
-7063,alias_default_1231,call_function,alias.default,backward,0,1,1,2,6015,94,4
-7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5
-7065,permute_1183,call_function,permute.default,backward,0,1,1,1,4,90,3
-7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5
-7067,permute_1184,call_function,permute.default,backward,0,1,1,1,6017,2,4
-7068,dtype_cast_501,call_function,dtype_cast.default,backward,0,1,1,1,6018,1,4
-7069,alias_default_1253,call_function,alias.default,backward,0,1,1,0,6019,0,3
-7070,convert_element_type_1908,call_function,convert_element_type.default,backward,0,1,1,1,6015,98,6
-7071,convert_element_type_1909,call_function,convert_element_type.default,backward,0,1,1,1,96,108,4
-7072,alias_default_1232,call_function,alias.default,backward,0,1,1,2,97,107,4
-7073,neg_55,call_function,neg.default,backward,0,1,1,1,98,106,8
-7074,exp_55,call_function,exp.default,backward,0,1,1,1,99,105,6
-7075,add_330,call_function,add.Tensor,backward,0,1,1,1,100,104,4
-7076,reciprocal_27,call_function,reciprocal.default,backward,0,1,1,1,101,103,4
-7077,mul_746,call_function,mul.Tensor,backward,0,1,1,1,102,102,6
-7078,alias_default_1233,call_function,alias.default,backward,0,1,1,2,103,101,4
-7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8
-7080,sub_82,call_function,sub.Tensor,backward,0,1,1,1,104,99,4
-7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8
-7082,add_331,call_function,add.Tensor,backward,0,1,1,1,106,97,4
-7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8
-7084,convert_element_type_1910,call_function,convert_element_type.default,backward,0,1,1,1,6029,95,6
-7085,alias_default_1234,call_function,alias.default,backward,0,1,1,2,6030,94,4
-7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5
-7087,permute_1187,call_function,permute.default,backward,0,1,1,1,4,90,3
-7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5
-7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10
-7090,permute_1188,call_function,permute.default,backward,0,1,1,1,6032,2,4
-7091,dtype_cast_502,call_function,dtype_cast.default,backward,0,1,1,1,6033,1,4
-7092,alias_default_1251,call_function,alias.default,backward,0,1,1,0,6034,0,3
-7093,convert_element_type_1915,call_function,convert_element_type.default,backward,0,1,1,1,6038,87,8
-7094,convert_element_type_1916,call_function,convert_element_type.default,backward,0,1,1,1,76,87,4
-7095,convert_element_type_1917,call_function,convert_element_type.default,backward,0,1,1,1,3,81,2
-7096,alias_default_1235,call_function,alias.default,backward,0,1,1,2,6039,86,4
-7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8
-7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8
-7099,alias_default_1236,call_function,alias.default,backward,0,1,1,2,6042,79,4
-7100,alias_default_1237,call_function,alias.default,backward,0,1,1,3,85,85,4
-7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8
-7102,sum_111,call_function,sum.dim_IntList,backward,0,1,1,1,6047,77,5
-7103,div_83,call_function,div.Tensor,backward,0,1,1,1,86,77,6
-7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8
-7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10
-7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8
-7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8
-7108,sum_112,call_function,sum.dim_IntList,backward,0,1,1,1,6044,3,5
-7109,convert_element_type_1918,call_function,convert_element_type.default,backward,0,1,1,1,6052,73,6
-7110,convert_element_type_1919,call_function,convert_element_type.default,backward,0,1,1,1,6045,2,3
-7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10
-7112,dtype_cast_503,call_function,dtype_cast.default,backward,0,1,1,1,6046,1,3
-7113,alias_default_1255,call_function,alias.default,backward,0,1,1,0,6047,0,2
-7114,alias_default_1238,call_function,alias.default,unknown,,1,1,3,6054,71,4
-7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5
-7116,permute_1191,call_function,permute.default,backward,0,1,1,1,4,67,3
-7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5
-7118,permute_1192,call_function,permute.default,backward,0,1,1,1,6056,2,4
-7119,dtype_cast_504,call_function,dtype_cast.default,backward,0,1,1,1,6057,1,4
-7120,alias_default_1250,call_function,alias.default,backward,0,1,1,0,6058,0,3
-7121,view_1250,call_function,view.default,backward,0,1,1,1,6057,65,4
-7122,permute_1193,call_function,permute.default,backward,0,1,1,1,6058,64,4
-7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2
-7124,getitem_333,call_function,getitem,backward,0,1,1,1,6063,36,2
-7125,getitem_334,call_function,getitem,backward,0,1,1,1,6063,37,2
-7126,getitem_335,call_function,getitem,backward,0,1,1,1,6063,30,2
-7127,permute_1194,call_function,permute.default,backward,0,1,1,1,6064,29,2
-7128,permute_1195,call_function,permute.default,backward,0,1,1,1,6064,36,2
-7129,permute_1196,call_function,permute.default,backward,0,1,1,1,6064,35,2
-7130,convert_element_type_1924,call_function,convert_element_type.default,backward,0,1,1,1,6065,35,2
-7131,convert_element_type_1925,call_function,convert_element_type.default,backward,0,1,1,1,6065,34,2
-7132,view_1251,call_function,view.default,backward,0,1,1,1,6066,34,2
-7133,view_as_complex_110,call_function,view_as_complex.default,backward,0,1,1,1,6067,33,6
-7134,_conj_54,call_function,_conj.default,backward,0,1,1,1,4,34,3
-7135,clone_222,call_function,clone.default,backward,0,1,1,1,5,33,3
-7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8
-7137,view_1252,call_function,view.default,backward,0,1,1,1,6066,33,2
-7138,view_as_complex_111,call_function,view_as_complex.default,backward,0,1,1,1,6067,32,6
-7139,_conj_55,call_function,_conj.default,backward,0,1,1,1,4,33,3
-7140,clone_223,call_function,clone.default,backward,0,1,1,1,5,32,3
-7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8
-7142,view_as_real_110,call_function,view_as_real.default,backward,0,1,1,1,6071,31,6
-7143,view_1253,call_function,view.default,backward,0,1,1,1,6072,30,6
-7144,convert_element_type_1926,call_function,convert_element_type.default,backward,0,1,1,1,6073,29,6
-7145,view_as_real_111,call_function,view_as_real.default,backward,0,1,1,1,6071,30,6
-7146,view_1254,call_function,view.default,backward,0,1,1,1,6072,29,6
-7147,convert_element_type_1927,call_function,convert_element_type.default,backward,0,1,1,1,6073,28,6
-7148,view_1255,call_function,view.default,backward,0,1,1,1,6065,28,2
-7149,view_1256,call_function,view.default,backward,0,1,1,1,6074,28,5
-7150,view_1257,call_function,view.default,backward,0,1,1,1,6074,27,5
-7151,alias_default_1239,call_function,alias.default,backward,0,1,1,2,6066,27,4
-7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5
-7153,permute_1199,call_function,permute.default,backward,0,1,1,1,4,23,3
-7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5
-7155,permute_1200,call_function,permute.default,backward,0,1,1,1,6068,2,4
-7156,dtype_cast_505,call_function,dtype_cast.default,backward,0,1,1,1,6069,1,4
-7157,alias_default_1249,call_function,alias.default,backward,0,1,1,0,6070,0,3
-7158,alias_default_1240,call_function,alias.default,backward,0,1,1,2,6075,27,4
-7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5
-7160,permute_1203,call_function,permute.default,backward,0,1,1,1,4,23,3
-7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5
-7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10
-7163,permute_1204,call_function,permute.default,backward,0,1,1,1,6077,2,4
-7164,dtype_cast_506,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4
-7165,alias_default_1248,call_function,alias.default,backward,0,1,1,0,6079,0,3
-7166,alias_default_1241,call_function,alias.default,backward,0,1,1,2,6075,26,4
-7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5
-7168,permute_1207,call_function,permute.default,backward,0,1,1,1,4,22,3
-7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5
-7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10
-7171,permute_1208,call_function,permute.default,backward,0,1,1,1,6077,2,4
-7172,dtype_cast_507,call_function,dtype_cast.default,backward,0,1,1,1,6078,1,4
-7173,alias_default_1247,call_function,alias.default,backward,0,1,1,0,6079,0,3
-7174,convert_element_type_1940,call_function,convert_element_type.default,backward,0,1,1,1,6101,19,8
-7175,convert_element_type_1941,call_function,convert_element_type.default,backward,0,1,1,1,7,19,4
-7176,convert_element_type_1942,call_function,convert_element_type.default,backward,0,1,1,1,3,13,2
-7177,alias_default_1242,call_function,alias.default,backward,0,1,1,2,6102,18,4
-7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8
-7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8
-7180,alias_default_1243,call_function,alias.default,backward,0,1,1,2,6105,11,4
-7181,alias_default_1244,call_function,alias.default,backward,0,1,1,3,16,17,4
-7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8
-7183,sum_113,call_function,sum.dim_IntList,backward,0,1,1,1,6110,9,5
-7184,div_84,call_function,div.Tensor,backward,0,1,1,1,17,9,6
-7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8
-7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10
-7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8
-7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8
-7189,sum_114,call_function,sum.dim_IntList,backward,0,1,1,1,6107,3,5
-7190,convert_element_type_1943,call_function,convert_element_type.default,backward,0,1,1,1,6115,5,6
-7191,convert_element_type_1944,call_function,convert_element_type.default,backward,0,1,1,1,6108,2,3
-7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10
-7193,dtype_cast_508,call_function,dtype_cast.default,backward,0,1,1,1,6109,1,3
-7194,alias_default_1254,call_function,alias.default,backward,0,1,1,0,6110,0,2
-7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5
-7196,dtype_cast_509,call_function,dtype_cast.default,backward,,1,1,1,6118,2,3
-7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9
-7198,alias_default_1246,call_function,alias.default,unknown,,1,1,0,6127,0,3
diff --git a/profile_results/real_llama3_3b_dag_summary.json b/profile_results/real_llama3_3b_dag_summary.json
deleted file mode 100644
index 93434ea9..00000000
--- a/profile_results/real_llama3_3b_dag_summary.json
+++ /dev/null
@@ -1,883 +0,0 @@
-{
-  "branch_points": 1301,
-  "dag_edges": 8805,
-  "direct_dependency_histogram": {
-    "0": 257,
-    "1": 5275,
-    "2": 1611,
-    "3": 28,
-    "8": 28
-  },
-  "direct_offspring_histogram": {
-    "0": 255,
-    "1": 5643,
-    "2": 934,
-    "3": 254,
-    "4": 84,
-    "6": 28,
-    "28": 1
-  },
-  "ilp_nodes": 7199,
-  "max_ancestor_count": 6127,
-  "max_descendant_count": 5943,
-  "max_direct_dependency_nodes": 8,
-  "max_direct_offspring_nodes": 28,
-  "merge_points": 1667,
-  "merge_points_csv": "profile_results/real_llama3_3b_merge_points.csv",
-  "mesh": "1D 64",
-  "model": "LLaMA3 3B",
-  "node_stats_csv": "profile_results/real_llama3_3b_dag_node_stats.csv",
-  "top_fanout_points": [
-    {
-      "ancestor_count": 1,
-      "descendant_count": 5942,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 28,
-      "idx": 296,
-      "layer": "",
-      "name": "alias_default_1",
-      "op": "call_function",
-      "phase": "unknown",
-      "strategy_count": 3,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 20,
-      "descendant_count": 5788,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 276,
-      "layer": 0,
-      "name": "alias_default_8",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 132,
-      "descendant_count": 5692,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 381,
-      "layer": 1,
-      "name": "alias_default_36",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 242,
-      "descendant_count": 5596,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 485,
-      "layer": 2,
-      "name": "alias_default_64",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 352,
-      "descendant_count": 5500,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 589,
-      "layer": 3,
-      "name": "alias_default_92",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 462,
-      "descendant_count": 5404,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 693,
-      "layer": 4,
-      "name": "alias_default_120",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 572,
-      "descendant_count": 5308,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 797,
-      "layer": 5,
-      "name": "alias_default_148",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 682,
-      "descendant_count": 5212,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 901,
-      "layer": 6,
-      "name": "alias_default_176",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 792,
-      "descendant_count": 5116,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1005,
-      "layer": 7,
-      "name": "alias_default_204",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 902,
-      "descendant_count": 5020,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1109,
-      "layer": 8,
-      "name": "alias_default_232",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1012,
-      "descendant_count": 4924,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1213,
-      "layer": 9,
-      "name": "alias_default_260",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1122,
-      "descendant_count": 4828,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1317,
-      "layer": 10,
-      "name": "alias_default_288",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1232,
-      "descendant_count": 4732,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1421,
-      "layer": 11,
-      "name": "alias_default_316",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1342,
-      "descendant_count": 4636,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1525,
-      "layer": 12,
-      "name": "alias_default_344",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1452,
-      "descendant_count": 4540,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1629,
-      "layer": 13,
-      "name": "alias_default_372",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1562,
-      "descendant_count": 4444,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1733,
-      "layer": 14,
-      "name": "alias_default_400",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1672,
-      "descendant_count": 4348,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1837,
-      "layer": 15,
-      "name": "alias_default_428",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1782,
-      "descendant_count": 4252,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 1941,
-      "layer": 16,
-      "name": "alias_default_456",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 1892,
-      "descendant_count": 4156,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2045,
-      "layer": 17,
-      "name": "alias_default_484",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2002,
-      "descendant_count": 4060,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2149,
-      "layer": 18,
-      "name": "alias_default_512",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2112,
-      "descendant_count": 3964,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2253,
-      "layer": 19,
-      "name": "alias_default_540",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2222,
-      "descendant_count": 3868,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2357,
-      "layer": 20,
-      "name": "alias_default_568",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2332,
-      "descendant_count": 3772,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2461,
-      "layer": 21,
-      "name": "alias_default_596",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2442,
-      "descendant_count": 3676,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2565,
-      "layer": 22,
-      "name": "alias_default_624",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2552,
-      "descendant_count": 3580,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2669,
-      "layer": 23,
-      "name": "alias_default_652",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2662,
-      "descendant_count": 3484,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2773,
-      "layer": 24,
-      "name": "alias_default_680",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2772,
-      "descendant_count": 3388,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2877,
-      "layer": 25,
-      "name": "alias_default_708",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2882,
-      "descendant_count": 3292,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 2981,
-      "layer": 26,
-      "name": "alias_default_736",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 2992,
-      "descendant_count": 3196,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 6,
-      "idx": 3085,
-      "layer": 27,
-      "name": "alias_default_764",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 4,
-      "target": "alias.default"
-    },
-    {
-      "ancestor_count": 3,
-      "descendant_count": 5778,
-      "direct_dependency_args": 1,
-      "direct_dependency_nodes": 1,
-      "direct_offspring_nodes": 4,
-      "idx": 298,
-      "layer": 0,
-      "name": "alias_default_12",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 3,
-      "target": "alias.default"
-    }
-  ],
-  "top_merge_points": [
-    {
-      "ancestor_count": 3173,
-      "descendant_count": 3033,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 3289,
-      "layer": 27,
-      "name": "_scaled_dot_product_flash_attention_backward",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3280,
-      "descendant_count": 2923,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 3431,
-      "layer": 26,
-      "name": "_scaled_dot_product_flash_attention_backward_1",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3387,
-      "descendant_count": 2813,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 3573,
-      "layer": 25,
-      "name": "_scaled_dot_product_flash_attention_backward_2",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3494,
-      "descendant_count": 2703,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 3715,
-      "layer": 24,
-      "name": "_scaled_dot_product_flash_attention_backward_3",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3601,
-      "descendant_count": 2593,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 3857,
-      "layer": 23,
-      "name": "_scaled_dot_product_flash_attention_backward_4",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3708,
-      "descendant_count": 2483,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 3999,
-      "layer": 22,
-      "name": "_scaled_dot_product_flash_attention_backward_5",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3815,
-      "descendant_count": 2373,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4141,
-      "layer": 21,
-      "name": "_scaled_dot_product_flash_attention_backward_6",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 3922,
-      "descendant_count": 2263,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4283,
-      "layer": 20,
-      "name": "_scaled_dot_product_flash_attention_backward_7",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4029,
-      "descendant_count": 2153,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4425,
-      "layer": 19,
-      "name": "_scaled_dot_product_flash_attention_backward_8",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4136,
-      "descendant_count": 2043,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4567,
-      "layer": 18,
-      "name": "_scaled_dot_product_flash_attention_backward_9",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4243,
-      "descendant_count": 1933,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4709,
-      "layer": 17,
-      "name": "_scaled_dot_product_flash_attention_backward_10",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4350,
-      "descendant_count": 1823,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4851,
-      "layer": 16,
-      "name": "_scaled_dot_product_flash_attention_backward_11",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4457,
-      "descendant_count": 1713,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 4993,
-      "layer": 15,
-      "name": "_scaled_dot_product_flash_attention_backward_12",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4564,
-      "descendant_count": 1603,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5135,
-      "layer": 14,
-      "name": "_scaled_dot_product_flash_attention_backward_13",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4671,
-      "descendant_count": 1493,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5277,
-      "layer": 13,
-      "name": "_scaled_dot_product_flash_attention_backward_14",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4778,
-      "descendant_count": 1383,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5419,
-      "layer": 12,
-      "name": "_scaled_dot_product_flash_attention_backward_15",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4885,
-      "descendant_count": 1273,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5561,
-      "layer": 11,
-      "name": "_scaled_dot_product_flash_attention_backward_16",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 4992,
-      "descendant_count": 1163,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5703,
-      "layer": 10,
-      "name": "_scaled_dot_product_flash_attention_backward_17",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5099,
-      "descendant_count": 1053,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5845,
-      "layer": 9,
-      "name": "_scaled_dot_product_flash_attention_backward_18",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5206,
-      "descendant_count": 943,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 5987,
-      "layer": 8,
-      "name": "_scaled_dot_product_flash_attention_backward_19",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5313,
-      "descendant_count": 833,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6129,
-      "layer": 7,
-      "name": "_scaled_dot_product_flash_attention_backward_20",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5420,
-      "descendant_count": 723,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6271,
-      "layer": 6,
-      "name": "_scaled_dot_product_flash_attention_backward_21",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5527,
-      "descendant_count": 613,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6413,
-      "layer": 5,
-      "name": "_scaled_dot_product_flash_attention_backward_22",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5634,
-      "descendant_count": 503,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6555,
-      "layer": 4,
-      "name": "_scaled_dot_product_flash_attention_backward_23",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5741,
-      "descendant_count": 393,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6697,
-      "layer": 3,
-      "name": "_scaled_dot_product_flash_attention_backward_24",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5848,
-      "descendant_count": 283,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6839,
-      "layer": 2,
-      "name": "_scaled_dot_product_flash_attention_backward_25",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 5955,
-      "descendant_count": 173,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 6981,
-      "layer": 1,
-      "name": "_scaled_dot_product_flash_attention_backward_26",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 6062,
-      "descendant_count": 63,
-      "direct_dependency_args": 8,
-      "direct_dependency_nodes": 8,
-      "direct_offspring_nodes": 3,
-      "idx": 7123,
-      "layer": 0,
-      "name": "_scaled_dot_product_flash_attention_backward_27",
-      "op": "call_function",
-      "phase": "backward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention_backward.default"
-    },
-    {
-      "ancestor_count": 63,
-      "descendant_count": 5761,
-      "direct_dependency_args": 3,
-      "direct_dependency_nodes": 3,
-      "direct_offspring_nodes": 4,
-      "idx": 313,
-      "layer": 0,
-      "name": "_scaled_dot_product_flash_attention",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention.default"
-    },
-    {
-      "ancestor_count": 173,
-      "descendant_count": 5665,
-      "direct_dependency_args": 3,
-      "direct_dependency_nodes": 3,
-      "direct_offspring_nodes": 4,
-      "idx": 417,
-      "layer": 1,
-      "name": "_scaled_dot_product_flash_attention_1",
-      "op": "call_function",
-      "phase": "forward",
-      "strategy_count": 2,
-      "target": "_scaled_dot_product_flash_attention.default"
-    }
-  ],
-  "trace_and_optimizer_build_s": 38.44014171184972,
-  "treewidth_upper_bounds": {
-    "moralized_edges": 11200,
-    "moralized_min_degree": 10,
-    "moralized_min_fill": 8,
-    "undirected_edges": 8805,
-    "undirected_min_degree": 9,
-    "undirected_min_fill": 6
-  }
-}
\ No newline at end of file
diff --git a/profile_results/real_llama3_3b_merge_points.csv b/profile_results/real_llama3_3b_merge_points.csv
deleted file mode 100644
index 4418765e..00000000
--- a/profile_results/real_llama3_3b_merge_points.csv
+++ /dev/null
@@ -1,1668 +0,0 @@
-idx,name,op,target,phase,layer,direct_dependency_args,direct_dependency_nodes,direct_offspring_nodes,ancestor_count,descendant_count,strategy_count
-3289,_scaled_dot_product_flash_attention_backward,call_function,_scaled_dot_product_flash_attention_backward.default,backward,27,8,8,3,3173,3033,2
-3431,_scaled_dot_product_flash_attention_backward_1,call_function,_scaled_dot_product_flash_attention_backward.default,backward,26,8,8,3,3280,2923,2
-3573,_scaled_dot_product_flash_attention_backward_2,call_function,_scaled_dot_product_flash_attention_backward.default,backward,25,8,8,3,3387,2813,2
-3715,_scaled_dot_product_flash_attention_backward_3,call_function,_scaled_dot_product_flash_attention_backward.default,backward,24,8,8,3,3494,2703,2
-3857,_scaled_dot_product_flash_attention_backward_4,call_function,_scaled_dot_product_flash_attention_backward.default,backward,23,8,8,3,3601,2593,2
-3999,_scaled_dot_product_flash_attention_backward_5,call_function,_scaled_dot_product_flash_attention_backward.default,backward,22,8,8,3,3708,2483,2
-4141,_scaled_dot_product_flash_attention_backward_6,call_function,_scaled_dot_product_flash_attention_backward.default,backward,21,8,8,3,3815,2373,2
-4283,_scaled_dot_product_flash_attention_backward_7,call_function,_scaled_dot_product_flash_attention_backward.default,backward,20,8,8,3,3922,2263,2
-4425,_scaled_dot_product_flash_attention_backward_8,call_function,_scaled_dot_product_flash_attention_backward.default,backward,19,8,8,3,4029,2153,2
-4567,_scaled_dot_product_flash_attention_backward_9,call_function,_scaled_dot_product_flash_attention_backward.default,backward,18,8,8,3,4136,2043,2
-4709,_scaled_dot_product_flash_attention_backward_10,call_function,_scaled_dot_product_flash_attention_backward.default,backward,17,8,8,3,4243,1933,2
-4851,_scaled_dot_product_flash_attention_backward_11,call_function,_scaled_dot_product_flash_attention_backward.default,backward,16,8,8,3,4350,1823,2
-4993,_scaled_dot_product_flash_attention_backward_12,call_function,_scaled_dot_product_flash_attention_backward.default,backward,15,8,8,3,4457,1713,2
-5135,_scaled_dot_product_flash_attention_backward_13,call_function,_scaled_dot_product_flash_attention_backward.default,backward,14,8,8,3,4564,1603,2
-5277,_scaled_dot_product_flash_attention_backward_14,call_function,_scaled_dot_product_flash_attention_backward.default,backward,13,8,8,3,4671,1493,2
-5419,_scaled_dot_product_flash_attention_backward_15,call_function,_scaled_dot_product_flash_attention_backward.default,backward,12,8,8,3,4778,1383,2
-5561,_scaled_dot_product_flash_attention_backward_16,call_function,_scaled_dot_product_flash_attention_backward.default,backward,11,8,8,3,4885,1273,2
-5703,_scaled_dot_product_flash_attention_backward_17,call_function,_scaled_dot_product_flash_attention_backward.default,backward,10,8,8,3,4992,1163,2
-5845,_scaled_dot_product_flash_attention_backward_18,call_function,_scaled_dot_product_flash_attention_backward.default,backward,9,8,8,3,5099,1053,2
-5987,_scaled_dot_product_flash_attention_backward_19,call_function,_scaled_dot_product_flash_attention_backward.default,backward,8,8,8,3,5206,943,2
-6129,_scaled_dot_product_flash_attention_backward_20,call_function,_scaled_dot_product_flash_attention_backward.default,backward,7,8,8,3,5313,833,2
-6271,_scaled_dot_product_flash_attention_backward_21,call_function,_scaled_dot_product_flash_attention_backward.default,backward,6,8,8,3,5420,723,2
-6413,_scaled_dot_product_flash_attention_backward_22,call_function,_scaled_dot_product_flash_attention_backward.default,backward,5,8,8,3,5527,613,2
-6555,_scaled_dot_product_flash_attention_backward_23,call_function,_scaled_dot_product_flash_attention_backward.default,backward,4,8,8,3,5634,503,2
-6697,_scaled_dot_product_flash_attention_backward_24,call_function,_scaled_dot_product_flash_attention_backward.default,backward,3,8,8,3,5741,393,2
-6839,_scaled_dot_product_flash_attention_backward_25,call_function,_scaled_dot_product_flash_attention_backward.default,backward,2,8,8,3,5848,283,2
-6981,_scaled_dot_product_flash_attention_backward_26,call_function,_scaled_dot_product_flash_attention_backward.default,backward,1,8,8,3,5955,173,2
-7123,_scaled_dot_product_flash_attention_backward_27,call_function,_scaled_dot_product_flash_attention_backward.default,backward,0,8,8,3,6062,63,2
-313,_scaled_dot_product_flash_attention,call_function,_scaled_dot_product_flash_attention.default,forward,0,3,3,4,63,5761,2
-417,_scaled_dot_product_flash_attention_1,call_function,_scaled_dot_product_flash_attention.default,forward,1,3,3,4,173,5665,2
-521,_scaled_dot_product_flash_attention_2,call_function,_scaled_dot_product_flash_attention.default,forward,2,3,3,4,283,5569,2
-625,_scaled_dot_product_flash_attention_3,call_function,_scaled_dot_product_flash_attention.default,forward,3,3,3,4,393,5473,2
-729,_scaled_dot_product_flash_attention_4,call_function,_scaled_dot_product_flash_attention.default,forward,4,3,3,4,503,5377,2
-833,_scaled_dot_product_flash_attention_5,call_function,_scaled_dot_product_flash_attention.default,forward,5,3,3,4,613,5281,2
-937,_scaled_dot_product_flash_attention_6,call_function,_scaled_dot_product_flash_attention.default,forward,6,3,3,4,723,5185,2
-1041,_scaled_dot_product_flash_attention_7,call_function,_scaled_dot_product_flash_attention.default,forward,7,3,3,4,833,5089,2
-1145,_scaled_dot_product_flash_attention_8,call_function,_scaled_dot_product_flash_attention.default,forward,8,3,3,4,943,4993,2
-1249,_scaled_dot_product_flash_attention_9,call_function,_scaled_dot_product_flash_attention.default,forward,9,3,3,4,1053,4897,2
-1353,_scaled_dot_product_flash_attention_10,call_function,_scaled_dot_product_flash_attention.default,forward,10,3,3,4,1163,4801,2
-1457,_scaled_dot_product_flash_attention_11,call_function,_scaled_dot_product_flash_attention.default,forward,11,3,3,4,1273,4705,2
-1561,_scaled_dot_product_flash_attention_12,call_function,_scaled_dot_product_flash_attention.default,forward,12,3,3,4,1383,4609,2
-1665,_scaled_dot_product_flash_attention_13,call_function,_scaled_dot_product_flash_attention.default,forward,13,3,3,4,1493,4513,2
-1769,_scaled_dot_product_flash_attention_14,call_function,_scaled_dot_product_flash_attention.default,forward,14,3,3,4,1603,4417,2
-1873,_scaled_dot_product_flash_attention_15,call_function,_scaled_dot_product_flash_attention.default,forward,15,3,3,4,1713,4321,2
-1977,_scaled_dot_product_flash_attention_16,call_function,_scaled_dot_product_flash_attention.default,forward,16,3,3,4,1823,4225,2
-2081,_scaled_dot_product_flash_attention_17,call_function,_scaled_dot_product_flash_attention.default,forward,17,3,3,4,1933,4129,2
-2185,_scaled_dot_product_flash_attention_18,call_function,_scaled_dot_product_flash_attention.default,forward,18,3,3,4,2043,4033,2
-2289,_scaled_dot_product_flash_attention_19,call_function,_scaled_dot_product_flash_attention.default,forward,19,3,3,4,2153,3937,2
-2393,_scaled_dot_product_flash_attention_20,call_function,_scaled_dot_product_flash_attention.default,forward,20,3,3,4,2263,3841,2
-2497,_scaled_dot_product_flash_attention_21,call_function,_scaled_dot_product_flash_attention.default,forward,21,3,3,4,2373,3745,2
-2601,_scaled_dot_product_flash_attention_22,call_function,_scaled_dot_product_flash_attention.default,forward,22,3,3,4,2483,3649,2
-2705,_scaled_dot_product_flash_attention_23,call_function,_scaled_dot_product_flash_attention.default,forward,23,3,3,4,2593,3553,2
-2809,_scaled_dot_product_flash_attention_24,call_function,_scaled_dot_product_flash_attention.default,forward,24,3,3,4,2703,3457,2
-2913,_scaled_dot_product_flash_attention_25,call_function,_scaled_dot_product_flash_attention.default,forward,25,3,3,4,2813,3361,2
-3017,_scaled_dot_product_flash_attention_26,call_function,_scaled_dot_product_flash_attention.default,forward,26,3,3,4,2923,3265,2
-3121,_scaled_dot_product_flash_attention_27,call_function,_scaled_dot_product_flash_attention.default,forward,27,3,3,4,3033,3169,2
-260,embedding,call_function,embedding.default,forward,,2,2,1,5,5804,5
-270,mul,call_function,mul.Tensor,forward,0,2,2,1,14,5791,8
-272,mul_1,call_function,mul.Tensor,forward,0,2,2,1,18,5790,8
-278,einsum_default,call_function,einsum.default,forward,0,2,2,1,25,5772,5
-282,einsum_default_1,call_function,einsum.default,forward,0,2,2,1,25,5772,5
-299,mul_2,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
-302,mul_3,call_function,mul.Tensor,forward,0,2,2,1,34,5767,8
-286,einsum_default_2,call_function,einsum.default,forward,0,2,2,1,25,5765,5
-325,einsum_default_3,call_function,einsum.default,forward,0,2,2,1,73,5752,5
-326,add_1,call_function,add.Tensor,forward,0,2,2,1,74,5751,10
-336,mul_4,call_function,mul.Tensor,forward,0,2,2,1,83,5738,8
-338,mul_5,call_function,mul.Tensor,forward,0,2,2,1,87,5737,8
-344,einsum_default_4,call_function,einsum.default,forward,0,2,2,1,94,5732,5
-351,div,call_function,div.Tensor,forward,0,2,2,1,101,5714,6
-356,einsum_default_5,call_function,einsum.default,forward,0,2,2,1,94,5713,5
-359,mul_6,call_function,mul.Tensor,forward,0,2,2,1,110,5711,8
-364,einsum_default_6,call_function,einsum.default,forward,0,2,2,1,116,5709,5
-365,add_4,call_function,add.Tensor,forward,0,2,2,1,117,5708,10
-375,mul_7,call_function,mul.Tensor,forward,1,2,2,1,126,5695,8
-377,mul_8,call_function,mul.Tensor,forward,1,2,2,1,130,5694,8
-383,einsum_default_7,call_function,einsum.default,forward,1,2,2,1,137,5676,5
-387,einsum_default_8,call_function,einsum.default,forward,1,2,2,1,137,5676,5
-403,mul_9,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
-406,mul_10,call_function,mul.Tensor,forward,1,2,2,1,144,5671,8
-391,einsum_default_9,call_function,einsum.default,forward,1,2,2,1,137,5669,5
-429,einsum_default_10,call_function,einsum.default,forward,1,2,2,1,183,5656,5
-430,add_6,call_function,add.Tensor,forward,1,2,2,1,184,5655,10
-440,mul_11,call_function,mul.Tensor,forward,1,2,2,1,193,5642,8
-442,mul_12,call_function,mul.Tensor,forward,1,2,2,1,197,5641,8
-448,einsum_default_11,call_function,einsum.default,forward,1,2,2,1,204,5636,5
-455,div_1,call_function,div.Tensor,forward,1,2,2,1,211,5618,6
-460,einsum_default_12,call_function,einsum.default,forward,1,2,2,1,204,5617,5
-463,mul_13,call_function,mul.Tensor,forward,1,2,2,1,220,5615,8
-468,einsum_default_13,call_function,einsum.default,forward,1,2,2,1,226,5613,5
-469,add_9,call_function,add.Tensor,forward,1,2,2,1,227,5612,10
-479,mul_14,call_function,mul.Tensor,forward,2,2,2,1,236,5599,8
-481,mul_15,call_function,mul.Tensor,forward,2,2,2,1,240,5598,8
-487,einsum_default_14,call_function,einsum.default,forward,2,2,2,1,247,5580,5
-491,einsum_default_15,call_function,einsum.default,forward,2,2,2,1,247,5580,5
-507,mul_16,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
-510,mul_17,call_function,mul.Tensor,forward,2,2,2,1,254,5575,8
-495,einsum_default_16,call_function,einsum.default,forward,2,2,2,1,247,5573,5
-533,einsum_default_17,call_function,einsum.default,forward,2,2,2,1,293,5560,5
-534,add_11,call_function,add.Tensor,forward,2,2,2,1,294,5559,10
-544,mul_18,call_function,mul.Tensor,forward,2,2,2,1,303,5546,8
-546,mul_19,call_function,mul.Tensor,forward,2,2,2,1,307,5545,8
-552,einsum_default_18,call_function,einsum.default,forward,2,2,2,1,314,5540,5
-559,div_2,call_function,div.Tensor,forward,2,2,2,1,321,5522,6
-564,einsum_default_19,call_function,einsum.default,forward,2,2,2,1,314,5521,5
-567,mul_20,call_function,mul.Tensor,forward,2,2,2,1,330,5519,8
-572,einsum_default_20,call_function,einsum.default,forward,2,2,2,1,336,5517,5
-573,add_14,call_function,add.Tensor,forward,2,2,2,1,337,5516,10
-583,mul_21,call_function,mul.Tensor,forward,3,2,2,1,346,5503,8
-585,mul_22,call_function,mul.Tensor,forward,3,2,2,1,350,5502,8
-591,einsum_default_21,call_function,einsum.default,forward,3,2,2,1,357,5484,5
-595,einsum_default_22,call_function,einsum.default,forward,3,2,2,1,357,5484,5
-611,mul_23,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
-614,mul_24,call_function,mul.Tensor,forward,3,2,2,1,364,5479,8
-599,einsum_default_23,call_function,einsum.default,forward,3,2,2,1,357,5477,5
-637,einsum_default_24,call_function,einsum.default,forward,3,2,2,1,403,5464,5
-638,add_16,call_function,add.Tensor,forward,3,2,2,1,404,5463,10
-648,mul_25,call_function,mul.Tensor,forward,3,2,2,1,413,5450,8
-650,mul_26,call_function,mul.Tensor,forward,3,2,2,1,417,5449,8
-656,einsum_default_25,call_function,einsum.default,forward,3,2,2,1,424,5444,5
-663,div_3,call_function,div.Tensor,forward,3,2,2,1,431,5426,6
-668,einsum_default_26,call_function,einsum.default,forward,3,2,2,1,424,5425,5
-671,mul_27,call_function,mul.Tensor,forward,3,2,2,1,440,5423,8
-676,einsum_default_27,call_function,einsum.default,forward,3,2,2,1,446,5421,5
-677,add_19,call_function,add.Tensor,forward,3,2,2,1,447,5420,10
-687,mul_28,call_function,mul.Tensor,forward,4,2,2,1,456,5407,8
-689,mul_29,call_function,mul.Tensor,forward,4,2,2,1,460,5406,8
-695,einsum_default_28,call_function,einsum.default,forward,4,2,2,1,467,5388,5
-699,einsum_default_29,call_function,einsum.default,forward,4,2,2,1,467,5388,5
-715,mul_30,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
-718,mul_31,call_function,mul.Tensor,forward,4,2,2,1,474,5383,8
-703,einsum_default_30,call_function,einsum.default,forward,4,2,2,1,467,5381,5
-741,einsum_default_31,call_function,einsum.default,forward,4,2,2,1,513,5368,5
-742,add_21,call_function,add.Tensor,forward,4,2,2,1,514,5367,10
-752,mul_32,call_function,mul.Tensor,forward,4,2,2,1,523,5354,8
-754,mul_33,call_function,mul.Tensor,forward,4,2,2,1,527,5353,8
-760,einsum_default_32,call_function,einsum.default,forward,4,2,2,1,534,5348,5
-767,div_4,call_function,div.Tensor,forward,4,2,2,1,541,5330,6
-772,einsum_default_33,call_function,einsum.default,forward,4,2,2,1,534,5329,5
-775,mul_34,call_function,mul.Tensor,forward,4,2,2,1,550,5327,8
-780,einsum_default_34,call_function,einsum.default,forward,4,2,2,1,556,5325,5
-781,add_24,call_function,add.Tensor,forward,4,2,2,1,557,5324,10
-791,mul_35,call_function,mul.Tensor,forward,5,2,2,1,566,5311,8
-793,mul_36,call_function,mul.Tensor,forward,5,2,2,1,570,5310,8
-799,einsum_default_35,call_function,einsum.default,forward,5,2,2,1,577,5292,5
-803,einsum_default_36,call_function,einsum.default,forward,5,2,2,1,577,5292,5
-819,mul_37,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
-822,mul_38,call_function,mul.Tensor,forward,5,2,2,1,584,5287,8
-807,einsum_default_37,call_function,einsum.default,forward,5,2,2,1,577,5285,5
-845,einsum_default_38,call_function,einsum.default,forward,5,2,2,1,623,5272,5
-846,add_26,call_function,add.Tensor,forward,5,2,2,1,624,5271,10
-856,mul_39,call_function,mul.Tensor,forward,5,2,2,1,633,5258,8
-858,mul_40,call_function,mul.Tensor,forward,5,2,2,1,637,5257,8
-864,einsum_default_39,call_function,einsum.default,forward,5,2,2,1,644,5252,5
-871,div_5,call_function,div.Tensor,forward,5,2,2,1,651,5234,6
-876,einsum_default_40,call_function,einsum.default,forward,5,2,2,1,644,5233,5
-879,mul_41,call_function,mul.Tensor,forward,5,2,2,1,660,5231,8
-884,einsum_default_41,call_function,einsum.default,forward,5,2,2,1,666,5229,5
-885,add_29,call_function,add.Tensor,forward,5,2,2,1,667,5228,10
-895,mul_42,call_function,mul.Tensor,forward,6,2,2,1,676,5215,8
-897,mul_43,call_function,mul.Tensor,forward,6,2,2,1,680,5214,8
-903,einsum_default_42,call_function,einsum.default,forward,6,2,2,1,687,5196,5
-907,einsum_default_43,call_function,einsum.default,forward,6,2,2,1,687,5196,5
-923,mul_44,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
-926,mul_45,call_function,mul.Tensor,forward,6,2,2,1,694,5191,8
-911,einsum_default_44,call_function,einsum.default,forward,6,2,2,1,687,5189,5
-949,einsum_default_45,call_function,einsum.default,forward,6,2,2,1,733,5176,5
-950,add_31,call_function,add.Tensor,forward,6,2,2,1,734,5175,10
-960,mul_46,call_function,mul.Tensor,forward,6,2,2,1,743,5162,8
-962,mul_47,call_function,mul.Tensor,forward,6,2,2,1,747,5161,8
-968,einsum_default_46,call_function,einsum.default,forward,6,2,2,1,754,5156,5
-975,div_6,call_function,div.Tensor,forward,6,2,2,1,761,5138,6
-980,einsum_default_47,call_function,einsum.default,forward,6,2,2,1,754,5137,5
-983,mul_48,call_function,mul.Tensor,forward,6,2,2,1,770,5135,8
-988,einsum_default_48,call_function,einsum.default,forward,6,2,2,1,776,5133,5
-989,add_34,call_function,add.Tensor,forward,6,2,2,1,777,5132,10
-999,mul_49,call_function,mul.Tensor,forward,7,2,2,1,786,5119,8
-1001,mul_50,call_function,mul.Tensor,forward,7,2,2,1,790,5118,8
-1007,einsum_default_49,call_function,einsum.default,forward,7,2,2,1,797,5100,5
-1011,einsum_default_50,call_function,einsum.default,forward,7,2,2,1,797,5100,5
-1027,mul_51,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
-1030,mul_52,call_function,mul.Tensor,forward,7,2,2,1,804,5095,8
-1015,einsum_default_51,call_function,einsum.default,forward,7,2,2,1,797,5093,5
-1053,einsum_default_52,call_function,einsum.default,forward,7,2,2,1,843,5080,5
-1054,add_36,call_function,add.Tensor,forward,7,2,2,1,844,5079,10
-1064,mul_53,call_function,mul.Tensor,forward,7,2,2,1,853,5066,8
-1066,mul_54,call_function,mul.Tensor,forward,7,2,2,1,857,5065,8
-1072,einsum_default_53,call_function,einsum.default,forward,7,2,2,1,864,5060,5
-1079,div_7,call_function,div.Tensor,forward,7,2,2,1,871,5042,6
-1084,einsum_default_54,call_function,einsum.default,forward,7,2,2,1,864,5041,5
-1087,mul_55,call_function,mul.Tensor,forward,7,2,2,1,880,5039,8
-1092,einsum_default_55,call_function,einsum.default,forward,7,2,2,1,886,5037,5
-1093,add_39,call_function,add.Tensor,forward,7,2,2,1,887,5036,10
-1103,mul_56,call_function,mul.Tensor,forward,8,2,2,1,896,5023,8
-1105,mul_57,call_function,mul.Tensor,forward,8,2,2,1,900,5022,8
-1111,einsum_default_56,call_function,einsum.default,forward,8,2,2,1,907,5004,5
-1115,einsum_default_57,call_function,einsum.default,forward,8,2,2,1,907,5004,5
-1131,mul_58,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
-1134,mul_59,call_function,mul.Tensor,forward,8,2,2,1,914,4999,8
-1119,einsum_default_58,call_function,einsum.default,forward,8,2,2,1,907,4997,5
-1157,einsum_default_59,call_function,einsum.default,forward,8,2,2,1,953,4984,5
-1158,add_41,call_function,add.Tensor,forward,8,2,2,1,954,4983,10
-1168,mul_60,call_function,mul.Tensor,forward,8,2,2,1,963,4970,8
-1170,mul_61,call_function,mul.Tensor,forward,8,2,2,1,967,4969,8
-1176,einsum_default_60,call_function,einsum.default,forward,8,2,2,1,974,4964,5
-1183,div_8,call_function,div.Tensor,forward,8,2,2,1,981,4946,6
-1188,einsum_default_61,call_function,einsum.default,forward,8,2,2,1,974,4945,5
-1191,mul_62,call_function,mul.Tensor,forward,8,2,2,1,990,4943,8
-1196,einsum_default_62,call_function,einsum.default,forward,8,2,2,1,996,4941,5
-1197,add_44,call_function,add.Tensor,forward,8,2,2,1,997,4940,10
-1207,mul_63,call_function,mul.Tensor,forward,9,2,2,1,1006,4927,8
-1209,mul_64,call_function,mul.Tensor,forward,9,2,2,1,1010,4926,8
-1215,einsum_default_63,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
-1219,einsum_default_64,call_function,einsum.default,forward,9,2,2,1,1017,4908,5
-1235,mul_65,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
-1238,mul_66,call_function,mul.Tensor,forward,9,2,2,1,1024,4903,8
-1223,einsum_default_65,call_function,einsum.default,forward,9,2,2,1,1017,4901,5
-1261,einsum_default_66,call_function,einsum.default,forward,9,2,2,1,1063,4888,5
-1262,add_46,call_function,add.Tensor,forward,9,2,2,1,1064,4887,10
-1272,mul_67,call_function,mul.Tensor,forward,9,2,2,1,1073,4874,8
-1274,mul_68,call_function,mul.Tensor,forward,9,2,2,1,1077,4873,8
-1280,einsum_default_67,call_function,einsum.default,forward,9,2,2,1,1084,4868,5
-1287,div_9,call_function,div.Tensor,forward,9,2,2,1,1091,4850,6
-1292,einsum_default_68,call_function,einsum.default,forward,9,2,2,1,1084,4849,5
-1295,mul_69,call_function,mul.Tensor,forward,9,2,2,1,1100,4847,8
-1300,einsum_default_69,call_function,einsum.default,forward,9,2,2,1,1106,4845,5
-1301,add_49,call_function,add.Tensor,forward,9,2,2,1,1107,4844,10
-1311,mul_70,call_function,mul.Tensor,forward,10,2,2,1,1116,4831,8
-1313,mul_71,call_function,mul.Tensor,forward,10,2,2,1,1120,4830,8
-1319,einsum_default_70,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
-1323,einsum_default_71,call_function,einsum.default,forward,10,2,2,1,1127,4812,5
-1339,mul_72,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
-1342,mul_73,call_function,mul.Tensor,forward,10,2,2,1,1134,4807,8
-1327,einsum_default_72,call_function,einsum.default,forward,10,2,2,1,1127,4805,5
-1365,einsum_default_73,call_function,einsum.default,forward,10,2,2,1,1173,4792,5
-1366,add_51,call_function,add.Tensor,forward,10,2,2,1,1174,4791,10
-1376,mul_74,call_function,mul.Tensor,forward,10,2,2,1,1183,4778,8
-1378,mul_75,call_function,mul.Tensor,forward,10,2,2,1,1187,4777,8
-1384,einsum_default_74,call_function,einsum.default,forward,10,2,2,1,1194,4772,5
-1391,div_10,call_function,div.Tensor,forward,10,2,2,1,1201,4754,6
-1396,einsum_default_75,call_function,einsum.default,forward,10,2,2,1,1194,4753,5
-1399,mul_76,call_function,mul.Tensor,forward,10,2,2,1,1210,4751,8
-1404,einsum_default_76,call_function,einsum.default,forward,10,2,2,1,1216,4749,5
-1405,add_54,call_function,add.Tensor,forward,10,2,2,1,1217,4748,10
-1415,mul_77,call_function,mul.Tensor,forward,11,2,2,1,1226,4735,8
-1417,mul_78,call_function,mul.Tensor,forward,11,2,2,1,1230,4734,8
-1423,einsum_default_77,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
-1427,einsum_default_78,call_function,einsum.default,forward,11,2,2,1,1237,4716,5
-1443,mul_79,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
-1446,mul_80,call_function,mul.Tensor,forward,11,2,2,1,1244,4711,8
-1431,einsum_default_79,call_function,einsum.default,forward,11,2,2,1,1237,4709,5
-1469,einsum_default_80,call_function,einsum.default,forward,11,2,2,1,1283,4696,5
-1470,add_56,call_function,add.Tensor,forward,11,2,2,1,1284,4695,10
-1480,mul_81,call_function,mul.Tensor,forward,11,2,2,1,1293,4682,8
-1482,mul_82,call_function,mul.Tensor,forward,11,2,2,1,1297,4681,8
-1488,einsum_default_81,call_function,einsum.default,forward,11,2,2,1,1304,4676,5
-1495,div_11,call_function,div.Tensor,forward,11,2,2,1,1311,4658,6
-1500,einsum_default_82,call_function,einsum.default,forward,11,2,2,1,1304,4657,5
-1503,mul_83,call_function,mul.Tensor,forward,11,2,2,1,1320,4655,8
-1508,einsum_default_83,call_function,einsum.default,forward,11,2,2,1,1326,4653,5
-1509,add_59,call_function,add.Tensor,forward,11,2,2,1,1327,4652,10
-1519,mul_84,call_function,mul.Tensor,forward,12,2,2,1,1336,4639,8
-1521,mul_85,call_function,mul.Tensor,forward,12,2,2,1,1340,4638,8
-1527,einsum_default_84,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
-1531,einsum_default_85,call_function,einsum.default,forward,12,2,2,1,1347,4620,5
-1547,mul_86,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
-1550,mul_87,call_function,mul.Tensor,forward,12,2,2,1,1354,4615,8
-1535,einsum_default_86,call_function,einsum.default,forward,12,2,2,1,1347,4613,5
-1573,einsum_default_87,call_function,einsum.default,forward,12,2,2,1,1393,4600,5
-1574,add_61,call_function,add.Tensor,forward,12,2,2,1,1394,4599,10
-1584,mul_88,call_function,mul.Tensor,forward,12,2,2,1,1403,4586,8
-1586,mul_89,call_function,mul.Tensor,forward,12,2,2,1,1407,4585,8
-1592,einsum_default_88,call_function,einsum.default,forward,12,2,2,1,1414,4580,5
-1599,div_12,call_function,div.Tensor,forward,12,2,2,1,1421,4562,6
-1604,einsum_default_89,call_function,einsum.default,forward,12,2,2,1,1414,4561,5
-1607,mul_90,call_function,mul.Tensor,forward,12,2,2,1,1430,4559,8
-1612,einsum_default_90,call_function,einsum.default,forward,12,2,2,1,1436,4557,5
-1613,add_64,call_function,add.Tensor,forward,12,2,2,1,1437,4556,10
-1623,mul_91,call_function,mul.Tensor,forward,13,2,2,1,1446,4543,8
-1625,mul_92,call_function,mul.Tensor,forward,13,2,2,1,1450,4542,8
-1631,einsum_default_91,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
-1635,einsum_default_92,call_function,einsum.default,forward,13,2,2,1,1457,4524,5
-1651,mul_93,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
-1654,mul_94,call_function,mul.Tensor,forward,13,2,2,1,1464,4519,8
-1639,einsum_default_93,call_function,einsum.default,forward,13,2,2,1,1457,4517,5
-1677,einsum_default_94,call_function,einsum.default,forward,13,2,2,1,1503,4504,5
-1678,add_66,call_function,add.Tensor,forward,13,2,2,1,1504,4503,10
-1688,mul_95,call_function,mul.Tensor,forward,13,2,2,1,1513,4490,8
-1690,mul_96,call_function,mul.Tensor,forward,13,2,2,1,1517,4489,8
-1696,einsum_default_95,call_function,einsum.default,forward,13,2,2,1,1524,4484,5
-1703,div_13,call_function,div.Tensor,forward,13,2,2,1,1531,4466,6
-1708,einsum_default_96,call_function,einsum.default,forward,13,2,2,1,1524,4465,5
-1711,mul_97,call_function,mul.Tensor,forward,13,2,2,1,1540,4463,8
-1716,einsum_default_97,call_function,einsum.default,forward,13,2,2,1,1546,4461,5
-1717,add_69,call_function,add.Tensor,forward,13,2,2,1,1547,4460,10
-1727,mul_98,call_function,mul.Tensor,forward,14,2,2,1,1556,4447,8
-1729,mul_99,call_function,mul.Tensor,forward,14,2,2,1,1560,4446,8
-1735,einsum_default_98,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
-1739,einsum_default_99,call_function,einsum.default,forward,14,2,2,1,1567,4428,5
-1755,mul_100,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
-1758,mul_101,call_function,mul.Tensor,forward,14,2,2,1,1574,4423,8
-1743,einsum_default_100,call_function,einsum.default,forward,14,2,2,1,1567,4421,5
-1781,einsum_default_101,call_function,einsum.default,forward,14,2,2,1,1613,4408,5
-1782,add_71,call_function,add.Tensor,forward,14,2,2,1,1614,4407,10
-1792,mul_102,call_function,mul.Tensor,forward,14,2,2,1,1623,4394,8
-1794,mul_103,call_function,mul.Tensor,forward,14,2,2,1,1627,4393,8
-1800,einsum_default_102,call_function,einsum.default,forward,14,2,2,1,1634,4388,5
-1807,div_14,call_function,div.Tensor,forward,14,2,2,1,1641,4370,6
-1812,einsum_default_103,call_function,einsum.default,forward,14,2,2,1,1634,4369,5
-1815,mul_104,call_function,mul.Tensor,forward,14,2,2,1,1650,4367,8
-1820,einsum_default_104,call_function,einsum.default,forward,14,2,2,1,1656,4365,5
-1821,add_74,call_function,add.Tensor,forward,14,2,2,1,1657,4364,10
-1831,mul_105,call_function,mul.Tensor,forward,15,2,2,1,1666,4351,8
-1833,mul_106,call_function,mul.Tensor,forward,15,2,2,1,1670,4350,8
-1839,einsum_default_105,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
-1843,einsum_default_106,call_function,einsum.default,forward,15,2,2,1,1677,4332,5
-1859,mul_107,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
-1862,mul_108,call_function,mul.Tensor,forward,15,2,2,1,1684,4327,8
-1847,einsum_default_107,call_function,einsum.default,forward,15,2,2,1,1677,4325,5
-1885,einsum_default_108,call_function,einsum.default,forward,15,2,2,1,1723,4312,5
-1886,add_76,call_function,add.Tensor,forward,15,2,2,1,1724,4311,10
-1896,mul_109,call_function,mul.Tensor,forward,15,2,2,1,1733,4298,8
-1898,mul_110,call_function,mul.Tensor,forward,15,2,2,1,1737,4297,8
-1904,einsum_default_109,call_function,einsum.default,forward,15,2,2,1,1744,4292,5
-1911,div_15,call_function,div.Tensor,forward,15,2,2,1,1751,4274,6
-1916,einsum_default_110,call_function,einsum.default,forward,15,2,2,1,1744,4273,5
-1919,mul_111,call_function,mul.Tensor,forward,15,2,2,1,1760,4271,8
-1924,einsum_default_111,call_function,einsum.default,forward,15,2,2,1,1766,4269,5
-1925,add_79,call_function,add.Tensor,forward,15,2,2,1,1767,4268,10
-1935,mul_112,call_function,mul.Tensor,forward,16,2,2,1,1776,4255,8
-1937,mul_113,call_function,mul.Tensor,forward,16,2,2,1,1780,4254,8
-1943,einsum_default_112,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
-1947,einsum_default_113,call_function,einsum.default,forward,16,2,2,1,1787,4236,5
-1963,mul_114,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
-1966,mul_115,call_function,mul.Tensor,forward,16,2,2,1,1794,4231,8
-1951,einsum_default_114,call_function,einsum.default,forward,16,2,2,1,1787,4229,5
-1989,einsum_default_115,call_function,einsum.default,forward,16,2,2,1,1833,4216,5
-1990,add_81,call_function,add.Tensor,forward,16,2,2,1,1834,4215,10
-2000,mul_116,call_function,mul.Tensor,forward,16,2,2,1,1843,4202,8
-2002,mul_117,call_function,mul.Tensor,forward,16,2,2,1,1847,4201,8
-2008,einsum_default_116,call_function,einsum.default,forward,16,2,2,1,1854,4196,5
-2015,div_16,call_function,div.Tensor,forward,16,2,2,1,1861,4178,6
-2020,einsum_default_117,call_function,einsum.default,forward,16,2,2,1,1854,4177,5
-2023,mul_118,call_function,mul.Tensor,forward,16,2,2,1,1870,4175,8
-2028,einsum_default_118,call_function,einsum.default,forward,16,2,2,1,1876,4173,5
-2029,add_84,call_function,add.Tensor,forward,16,2,2,1,1877,4172,10
-2039,mul_119,call_function,mul.Tensor,forward,17,2,2,1,1886,4159,8
-2041,mul_120,call_function,mul.Tensor,forward,17,2,2,1,1890,4158,8
-2047,einsum_default_119,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
-2051,einsum_default_120,call_function,einsum.default,forward,17,2,2,1,1897,4140,5
-2067,mul_121,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
-2070,mul_122,call_function,mul.Tensor,forward,17,2,2,1,1904,4135,8
-2055,einsum_default_121,call_function,einsum.default,forward,17,2,2,1,1897,4133,5
-2093,einsum_default_122,call_function,einsum.default,forward,17,2,2,1,1943,4120,5
-2094,add_86,call_function,add.Tensor,forward,17,2,2,1,1944,4119,10
-2104,mul_123,call_function,mul.Tensor,forward,17,2,2,1,1953,4106,8
-2106,mul_124,call_function,mul.Tensor,forward,17,2,2,1,1957,4105,8
-2112,einsum_default_123,call_function,einsum.default,forward,17,2,2,1,1964,4100,5
-2119,div_17,call_function,div.Tensor,forward,17,2,2,1,1971,4082,6
-2124,einsum_default_124,call_function,einsum.default,forward,17,2,2,1,1964,4081,5
-2127,mul_125,call_function,mul.Tensor,forward,17,2,2,1,1980,4079,8
-2132,einsum_default_125,call_function,einsum.default,forward,17,2,2,1,1986,4077,5
-2133,add_89,call_function,add.Tensor,forward,17,2,2,1,1987,4076,10
-2143,mul_126,call_function,mul.Tensor,forward,18,2,2,1,1996,4063,8
-2145,mul_127,call_function,mul.Tensor,forward,18,2,2,1,2000,4062,8
-2151,einsum_default_126,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
-2155,einsum_default_127,call_function,einsum.default,forward,18,2,2,1,2007,4044,5
-2171,mul_128,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
-2174,mul_129,call_function,mul.Tensor,forward,18,2,2,1,2014,4039,8
-2159,einsum_default_128,call_function,einsum.default,forward,18,2,2,1,2007,4037,5
-2197,einsum_default_129,call_function,einsum.default,forward,18,2,2,1,2053,4024,5
-2198,add_91,call_function,add.Tensor,forward,18,2,2,1,2054,4023,10
-2208,mul_130,call_function,mul.Tensor,forward,18,2,2,1,2063,4010,8
-2210,mul_131,call_function,mul.Tensor,forward,18,2,2,1,2067,4009,8
-2216,einsum_default_130,call_function,einsum.default,forward,18,2,2,1,2074,4004,5
-2223,div_18,call_function,div.Tensor,forward,18,2,2,1,2081,3986,6
-2228,einsum_default_131,call_function,einsum.default,forward,18,2,2,1,2074,3985,5
-2231,mul_132,call_function,mul.Tensor,forward,18,2,2,1,2090,3983,8
-2236,einsum_default_132,call_function,einsum.default,forward,18,2,2,1,2096,3981,5
-2237,add_94,call_function,add.Tensor,forward,18,2,2,1,2097,3980,10
-2247,mul_133,call_function,mul.Tensor,forward,19,2,2,1,2106,3967,8
-2249,mul_134,call_function,mul.Tensor,forward,19,2,2,1,2110,3966,8
-2255,einsum_default_133,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
-2259,einsum_default_134,call_function,einsum.default,forward,19,2,2,1,2117,3948,5
-2275,mul_135,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
-2278,mul_136,call_function,mul.Tensor,forward,19,2,2,1,2124,3943,8
-2263,einsum_default_135,call_function,einsum.default,forward,19,2,2,1,2117,3941,5
-2301,einsum_default_136,call_function,einsum.default,forward,19,2,2,1,2163,3928,5
-2302,add_96,call_function,add.Tensor,forward,19,2,2,1,2164,3927,10
-2312,mul_137,call_function,mul.Tensor,forward,19,2,2,1,2173,3914,8
-2314,mul_138,call_function,mul.Tensor,forward,19,2,2,1,2177,3913,8
-2320,einsum_default_137,call_function,einsum.default,forward,19,2,2,1,2184,3908,5
-2327,div_19,call_function,div.Tensor,forward,19,2,2,1,2191,3890,6
-2332,einsum_default_138,call_function,einsum.default,forward,19,2,2,1,2184,3889,5
-2335,mul_139,call_function,mul.Tensor,forward,19,2,2,1,2200,3887,8
-2340,einsum_default_139,call_function,einsum.default,forward,19,2,2,1,2206,3885,5
-2341,add_99,call_function,add.Tensor,forward,19,2,2,1,2207,3884,10
-2351,mul_140,call_function,mul.Tensor,forward,20,2,2,1,2216,3871,8
-2353,mul_141,call_function,mul.Tensor,forward,20,2,2,1,2220,3870,8
-2359,einsum_default_140,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
-2363,einsum_default_141,call_function,einsum.default,forward,20,2,2,1,2227,3852,5
-2379,mul_142,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
-2382,mul_143,call_function,mul.Tensor,forward,20,2,2,1,2234,3847,8
-2367,einsum_default_142,call_function,einsum.default,forward,20,2,2,1,2227,3845,5
-2405,einsum_default_143,call_function,einsum.default,forward,20,2,2,1,2273,3832,5
-2406,add_101,call_function,add.Tensor,forward,20,2,2,1,2274,3831,10
-2416,mul_144,call_function,mul.Tensor,forward,20,2,2,1,2283,3818,8
-2418,mul_145,call_function,mul.Tensor,forward,20,2,2,1,2287,3817,8
-2424,einsum_default_144,call_function,einsum.default,forward,20,2,2,1,2294,3812,5
-2431,div_20,call_function,div.Tensor,forward,20,2,2,1,2301,3794,6
-2436,einsum_default_145,call_function,einsum.default,forward,20,2,2,1,2294,3793,5
-2439,mul_146,call_function,mul.Tensor,forward,20,2,2,1,2310,3791,8
-2444,einsum_default_146,call_function,einsum.default,forward,20,2,2,1,2316,3789,5
-2445,add_104,call_function,add.Tensor,forward,20,2,2,1,2317,3788,10
-2455,mul_147,call_function,mul.Tensor,forward,21,2,2,1,2326,3775,8
-2457,mul_148,call_function,mul.Tensor,forward,21,2,2,1,2330,3774,8
-2463,einsum_default_147,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
-2467,einsum_default_148,call_function,einsum.default,forward,21,2,2,1,2337,3756,5
-2483,mul_149,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
-2486,mul_150,call_function,mul.Tensor,forward,21,2,2,1,2344,3751,8
-2471,einsum_default_149,call_function,einsum.default,forward,21,2,2,1,2337,3749,5
-2509,einsum_default_150,call_function,einsum.default,forward,21,2,2,1,2383,3736,5
-2510,add_106,call_function,add.Tensor,forward,21,2,2,1,2384,3735,10
-2520,mul_151,call_function,mul.Tensor,forward,21,2,2,1,2393,3722,8
-2522,mul_152,call_function,mul.Tensor,forward,21,2,2,1,2397,3721,8
-2528,einsum_default_151,call_function,einsum.default,forward,21,2,2,1,2404,3716,5
-2535,div_21,call_function,div.Tensor,forward,21,2,2,1,2411,3698,6
-2540,einsum_default_152,call_function,einsum.default,forward,21,2,2,1,2404,3697,5
-2543,mul_153,call_function,mul.Tensor,forward,21,2,2,1,2420,3695,8
-2548,einsum_default_153,call_function,einsum.default,forward,21,2,2,1,2426,3693,5
-2549,add_109,call_function,add.Tensor,forward,21,2,2,1,2427,3692,10
-2559,mul_154,call_function,mul.Tensor,forward,22,2,2,1,2436,3679,8
-2561,mul_155,call_function,mul.Tensor,forward,22,2,2,1,2440,3678,8
-2567,einsum_default_154,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
-2571,einsum_default_155,call_function,einsum.default,forward,22,2,2,1,2447,3660,5
-2587,mul_156,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
-2590,mul_157,call_function,mul.Tensor,forward,22,2,2,1,2454,3655,8
-2575,einsum_default_156,call_function,einsum.default,forward,22,2,2,1,2447,3653,5
-2613,einsum_default_157,call_function,einsum.default,forward,22,2,2,1,2493,3640,5
-2614,add_111,call_function,add.Tensor,forward,22,2,2,1,2494,3639,10
-2624,mul_158,call_function,mul.Tensor,forward,22,2,2,1,2503,3626,8
-2626,mul_159,call_function,mul.Tensor,forward,22,2,2,1,2507,3625,8
-2632,einsum_default_158,call_function,einsum.default,forward,22,2,2,1,2514,3620,5
-2639,div_22,call_function,div.Tensor,forward,22,2,2,1,2521,3602,6
-2644,einsum_default_159,call_function,einsum.default,forward,22,2,2,1,2514,3601,5
-2647,mul_160,call_function,mul.Tensor,forward,22,2,2,1,2530,3599,8
-2652,einsum_default_160,call_function,einsum.default,forward,22,2,2,1,2536,3597,5
-2653,add_114,call_function,add.Tensor,forward,22,2,2,1,2537,3596,10
-2663,mul_161,call_function,mul.Tensor,forward,23,2,2,1,2546,3583,8
-2665,mul_162,call_function,mul.Tensor,forward,23,2,2,1,2550,3582,8
-2671,einsum_default_161,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
-2675,einsum_default_162,call_function,einsum.default,forward,23,2,2,1,2557,3564,5
-2691,mul_163,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
-2694,mul_164,call_function,mul.Tensor,forward,23,2,2,1,2564,3559,8
-2679,einsum_default_163,call_function,einsum.default,forward,23,2,2,1,2557,3557,5
-2717,einsum_default_164,call_function,einsum.default,forward,23,2,2,1,2603,3544,5
-2718,add_116,call_function,add.Tensor,forward,23,2,2,1,2604,3543,10
-2728,mul_165,call_function,mul.Tensor,forward,23,2,2,1,2613,3530,8
-2730,mul_166,call_function,mul.Tensor,forward,23,2,2,1,2617,3529,8
-2736,einsum_default_165,call_function,einsum.default,forward,23,2,2,1,2624,3524,5
-2743,div_23,call_function,div.Tensor,forward,23,2,2,1,2631,3506,6
-2748,einsum_default_166,call_function,einsum.default,forward,23,2,2,1,2624,3505,5
-2751,mul_167,call_function,mul.Tensor,forward,23,2,2,1,2640,3503,8
-2756,einsum_default_167,call_function,einsum.default,forward,23,2,2,1,2646,3501,5
-2757,add_119,call_function,add.Tensor,forward,23,2,2,1,2647,3500,10
-2767,mul_168,call_function,mul.Tensor,forward,24,2,2,1,2656,3487,8
-2769,mul_169,call_function,mul.Tensor,forward,24,2,2,1,2660,3486,8
-2775,einsum_default_168,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
-2779,einsum_default_169,call_function,einsum.default,forward,24,2,2,1,2667,3468,5
-2795,mul_170,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
-2798,mul_171,call_function,mul.Tensor,forward,24,2,2,1,2674,3463,8
-2783,einsum_default_170,call_function,einsum.default,forward,24,2,2,1,2667,3461,5
-2821,einsum_default_171,call_function,einsum.default,forward,24,2,2,1,2713,3448,5
-2822,add_121,call_function,add.Tensor,forward,24,2,2,1,2714,3447,10
-2832,mul_172,call_function,mul.Tensor,forward,24,2,2,1,2723,3434,8
-2834,mul_173,call_function,mul.Tensor,forward,24,2,2,1,2727,3433,8
-2840,einsum_default_172,call_function,einsum.default,forward,24,2,2,1,2734,3428,5
-2847,div_24,call_function,div.Tensor,forward,24,2,2,1,2741,3410,6
-2852,einsum_default_173,call_function,einsum.default,forward,24,2,2,1,2734,3409,5
-2855,mul_174,call_function,mul.Tensor,forward,24,2,2,1,2750,3407,8
-2860,einsum_default_174,call_function,einsum.default,forward,24,2,2,1,2756,3405,5
-2861,add_124,call_function,add.Tensor,forward,24,2,2,1,2757,3404,10
-2871,mul_175,call_function,mul.Tensor,forward,25,2,2,1,2766,3391,8
-2873,mul_176,call_function,mul.Tensor,forward,25,2,2,1,2770,3390,8
-2879,einsum_default_175,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
-2883,einsum_default_176,call_function,einsum.default,forward,25,2,2,1,2777,3372,5
-2899,mul_177,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
-2902,mul_178,call_function,mul.Tensor,forward,25,2,2,1,2784,3367,8
-2887,einsum_default_177,call_function,einsum.default,forward,25,2,2,1,2777,3365,5
-2925,einsum_default_178,call_function,einsum.default,forward,25,2,2,1,2823,3352,5
-2926,add_126,call_function,add.Tensor,forward,25,2,2,1,2824,3351,10
-2936,mul_179,call_function,mul.Tensor,forward,25,2,2,1,2833,3338,8
-2938,mul_180,call_function,mul.Tensor,forward,25,2,2,1,2837,3337,8
-2944,einsum_default_179,call_function,einsum.default,forward,25,2,2,1,2844,3332,5
-2951,div_25,call_function,div.Tensor,forward,25,2,2,1,2851,3314,6
-2956,einsum_default_180,call_function,einsum.default,forward,25,2,2,1,2844,3313,5
-2959,mul_181,call_function,mul.Tensor,forward,25,2,2,1,2860,3311,8
-2964,einsum_default_181,call_function,einsum.default,forward,25,2,2,1,2866,3309,5
-2965,add_129,call_function,add.Tensor,forward,25,2,2,1,2867,3308,10
-2975,mul_182,call_function,mul.Tensor,forward,26,2,2,1,2876,3295,8
-2977,mul_183,call_function,mul.Tensor,forward,26,2,2,1,2880,3294,8
-2983,einsum_default_182,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
-2987,einsum_default_183,call_function,einsum.default,forward,26,2,2,1,2887,3276,5
-3003,mul_184,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
-3006,mul_185,call_function,mul.Tensor,forward,26,2,2,1,2894,3271,8
-2991,einsum_default_184,call_function,einsum.default,forward,26,2,2,1,2887,3269,5
-3029,einsum_default_185,call_function,einsum.default,forward,26,2,2,1,2933,3256,5
-3030,add_131,call_function,add.Tensor,forward,26,2,2,1,2934,3255,10
-3040,mul_186,call_function,mul.Tensor,forward,26,2,2,1,2943,3242,8
-3042,mul_187,call_function,mul.Tensor,forward,26,2,2,1,2947,3241,8
-3048,einsum_default_186,call_function,einsum.default,forward,26,2,2,1,2954,3236,5
-3055,div_26,call_function,div.Tensor,forward,26,2,2,1,2961,3218,6
-3060,einsum_default_187,call_function,einsum.default,forward,26,2,2,1,2954,3217,5
-3063,mul_188,call_function,mul.Tensor,forward,26,2,2,1,2970,3215,8
-3068,einsum_default_188,call_function,einsum.default,forward,26,2,2,1,2976,3213,5
-3069,add_134,call_function,add.Tensor,forward,26,2,2,1,2977,3212,10
-3079,mul_189,call_function,mul.Tensor,forward,27,2,2,1,2986,3199,8
-3081,mul_190,call_function,mul.Tensor,forward,27,2,2,1,2990,3198,8
-3087,einsum_default_189,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
-3091,einsum_default_190,call_function,einsum.default,forward,27,2,2,1,2997,3180,5
-3107,mul_191,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
-3110,mul_192,call_function,mul.Tensor,forward,27,2,2,1,3004,3175,8
-3095,einsum_default_191,call_function,einsum.default,forward,27,2,2,1,2997,3173,5
-3133,einsum_default_192,call_function,einsum.default,forward,27,2,2,1,3043,3160,5
-3134,add_136,call_function,add.Tensor,forward,27,2,2,1,3044,3159,10
-3144,mul_193,call_function,mul.Tensor,forward,27,2,2,1,3053,3146,8
-3146,mul_194,call_function,mul.Tensor,forward,27,2,2,1,3057,3145,8
-3152,einsum_default_193,call_function,einsum.default,forward,27,2,2,1,3064,3140,5
-3159,div_27,call_function,div.Tensor,forward,27,2,2,1,3071,3122,6
-3164,einsum_default_194,call_function,einsum.default,forward,27,2,2,1,3064,3121,5
-3167,mul_195,call_function,mul.Tensor,forward,27,2,2,1,3080,3119,8
-3172,einsum_default_195,call_function,einsum.default,forward,27,2,2,1,3086,3117,5
-3173,add_139,call_function,add.Tensor,forward,27,2,2,1,3087,3116,10
-3196,einsum_default_198,call_function,einsum.default,backward,,2,2,1,8,3099,5
-3204,mul_199,call_function,mul.Tensor,backward,,2,2,1,3097,3097,8
-3203,mul_198,call_function,mul.Tensor,backward,,2,2,1,15,3091,8
-3207,mul_200,call_function,mul.Tensor,backward,,2,2,1,3114,3089,8
-3210,mul_201,call_function,mul.Tensor,backward,,2,2,1,3117,3087,8
-3211,sub,call_function,sub.Tensor,backward,,2,2,1,3118,3086,10
-3212,mul_202,call_function,mul.Tensor,backward,,2,2,1,3119,3085,8
-3222,einsum_default_200,call_function,einsum.default,backward,27,2,2,1,3123,3078,5
-3228,mul_205,call_function,mul.Tensor,backward,27,2,2,1,3125,3069,8
-3247,mul_208,call_function,mul.Tensor,backward,27,2,2,1,3075,3068,8
-3245,mul_207,call_function,mul.Tensor,backward,27,2,2,1,3135,3067,8
-3249,mul_209,call_function,mul.Tensor,backward,27,2,2,1,3139,3066,8
-3227,mul_204,call_function,mul.Tensor,backward,27,2,2,1,3125,3065,8
-3232,einsum_default_202,call_function,einsum.default,backward,27,2,2,1,3128,3059,5
-3254,einsum_default_204,call_function,einsum.default,backward,27,2,2,1,3143,3059,5
-3255,add_143,call_function,add.Tensor,unknown,,2,2,1,3148,3058,10
-3264,mul_211,call_function,mul.Tensor,backward,27,2,2,1,3054,3056,8
-3263,mul_210,call_function,mul.Tensor,backward,27,2,2,1,3152,3050,8
-3267,mul_212,call_function,mul.Tensor,backward,27,2,2,1,3157,3048,8
-3270,mul_213,call_function,mul.Tensor,backward,27,2,2,1,3160,3046,8
-3271,sub_2,call_function,sub.Tensor,backward,27,2,2,1,3161,3045,10
-3272,mul_214,call_function,mul.Tensor,backward,27,2,2,1,3162,3044,8
-3277,add_144,call_function,add.Tensor,unknown,,2,2,1,3164,3042,10
-3283,einsum_default_206,call_function,einsum.default,backward,27,2,2,1,3167,3036,5
-3302,mul_216,call_function,mul.Tensor,backward,27,2,2,1,3181,3002,8
-3307,mul_217,call_function,mul.Tensor,backward,27,2,2,1,3181,3001,8
-3320,einsum_default_208,call_function,einsum.default,backward,27,2,2,1,3179,2992,5
-3327,einsum_default_210,call_function,einsum.default,backward,27,2,2,1,3188,2992,5
-3328,add_145,call_function,add.Tensor,unknown,,2,2,1,3195,2991,10
-3335,einsum_default_212,call_function,einsum.default,backward,27,2,2,1,3188,2991,5
-3336,add_146,call_function,add.Tensor,unknown,,2,2,1,3211,2990,10
-3345,mul_219,call_function,mul.Tensor,backward,27,2,2,1,2987,2988,8
-3344,mul_218,call_function,mul.Tensor,backward,27,2,2,1,3215,2982,8
-3348,mul_220,call_function,mul.Tensor,backward,27,2,2,1,3220,2980,8
-3351,mul_221,call_function,mul.Tensor,backward,27,2,2,1,3223,2978,8
-3352,sub_3,call_function,sub.Tensor,backward,27,2,2,1,3224,2977,10
-3353,mul_222,call_function,mul.Tensor,backward,27,2,2,1,3225,2976,8
-3358,add_147,call_function,add.Tensor,unknown,,2,2,1,3227,2974,10
-3364,einsum_default_214,call_function,einsum.default,backward,26,2,2,1,3230,2968,5
-3370,mul_225,call_function,mul.Tensor,backward,26,2,2,1,3232,2959,8
-3389,mul_228,call_function,mul.Tensor,backward,26,2,2,1,2965,2958,8
-3387,mul_227,call_function,mul.Tensor,backward,26,2,2,1,3242,2957,8
-3391,mul_229,call_function,mul.Tensor,backward,26,2,2,1,3246,2956,8
-3369,mul_224,call_function,mul.Tensor,backward,26,2,2,1,3232,2955,8
-3374,einsum_default_216,call_function,einsum.default,backward,26,2,2,1,3235,2949,5
-3396,einsum_default_218,call_function,einsum.default,backward,26,2,2,1,3250,2949,5
-3397,add_150,call_function,add.Tensor,unknown,,2,2,1,3255,2948,10
-3406,mul_231,call_function,mul.Tensor,backward,26,2,2,1,2944,2946,8
-3405,mul_230,call_function,mul.Tensor,backward,26,2,2,1,3259,2940,8
-3409,mul_232,call_function,mul.Tensor,backward,26,2,2,1,3264,2938,8
-3412,mul_233,call_function,mul.Tensor,backward,26,2,2,1,3267,2936,8
-3413,sub_5,call_function,sub.Tensor,backward,26,2,2,1,3268,2935,10
-3414,mul_234,call_function,mul.Tensor,backward,26,2,2,1,3269,2934,8
-3419,add_151,call_function,add.Tensor,unknown,,2,2,1,3271,2932,10
-3425,einsum_default_220,call_function,einsum.default,backward,26,2,2,1,3274,2926,5
-3444,mul_236,call_function,mul.Tensor,backward,26,2,2,1,3288,2892,8
-3449,mul_237,call_function,mul.Tensor,backward,26,2,2,1,3288,2891,8
-3462,einsum_default_222,call_function,einsum.default,backward,26,2,2,1,3286,2882,5
-3469,einsum_default_224,call_function,einsum.default,backward,26,2,2,1,3295,2882,5
-3470,add_152,call_function,add.Tensor,unknown,,2,2,1,3302,2881,10
-3477,einsum_default_226,call_function,einsum.default,backward,26,2,2,1,3295,2881,5
-3478,add_153,call_function,add.Tensor,unknown,,2,2,1,3318,2880,10
-3487,mul_239,call_function,mul.Tensor,backward,26,2,2,1,2877,2878,8
-3486,mul_238,call_function,mul.Tensor,backward,26,2,2,1,3322,2872,8
-3490,mul_240,call_function,mul.Tensor,backward,26,2,2,1,3327,2870,8
-3493,mul_241,call_function,mul.Tensor,backward,26,2,2,1,3330,2868,8
-3494,sub_6,call_function,sub.Tensor,backward,26,2,2,1,3331,2867,10
-3495,mul_242,call_function,mul.Tensor,backward,26,2,2,1,3332,2866,8
-3500,add_154,call_function,add.Tensor,unknown,,2,2,1,3334,2864,10
-3506,einsum_default_228,call_function,einsum.default,backward,25,2,2,1,3337,2858,5
-3512,mul_245,call_function,mul.Tensor,backward,25,2,2,1,3339,2849,8
-3531,mul_248,call_function,mul.Tensor,backward,25,2,2,1,2855,2848,8
-3529,mul_247,call_function,mul.Tensor,backward,25,2,2,1,3349,2847,8
-3533,mul_249,call_function,mul.Tensor,backward,25,2,2,1,3353,2846,8
-3511,mul_244,call_function,mul.Tensor,backward,25,2,2,1,3339,2845,8
-3516,einsum_default_230,call_function,einsum.default,backward,25,2,2,1,3342,2839,5
-3538,einsum_default_232,call_function,einsum.default,backward,25,2,2,1,3357,2839,5
-3539,add_157,call_function,add.Tensor,unknown,,2,2,1,3362,2838,10
-3548,mul_251,call_function,mul.Tensor,backward,25,2,2,1,2834,2836,8
-3547,mul_250,call_function,mul.Tensor,backward,25,2,2,1,3366,2830,8
-3551,mul_252,call_function,mul.Tensor,backward,25,2,2,1,3371,2828,8
-3554,mul_253,call_function,mul.Tensor,backward,25,2,2,1,3374,2826,8
-3555,sub_8,call_function,sub.Tensor,backward,25,2,2,1,3375,2825,10
-3556,mul_254,call_function,mul.Tensor,backward,25,2,2,1,3376,2824,8
-3561,add_158,call_function,add.Tensor,unknown,,2,2,1,3378,2822,10
-3567,einsum_default_234,call_function,einsum.default,backward,25,2,2,1,3381,2816,5
-3586,mul_256,call_function,mul.Tensor,backward,25,2,2,1,3395,2782,8
-3591,mul_257,call_function,mul.Tensor,backward,25,2,2,1,3395,2781,8
-3604,einsum_default_236,call_function,einsum.default,backward,25,2,2,1,3393,2772,5
-3611,einsum_default_238,call_function,einsum.default,backward,25,2,2,1,3402,2772,5
-3612,add_159,call_function,add.Tensor,unknown,,2,2,1,3409,2771,10
-3619,einsum_default_240,call_function,einsum.default,backward,25,2,2,1,3402,2771,5
-3620,add_160,call_function,add.Tensor,unknown,,2,2,1,3425,2770,10
-3629,mul_259,call_function,mul.Tensor,backward,25,2,2,1,2767,2768,8
-3628,mul_258,call_function,mul.Tensor,backward,25,2,2,1,3429,2762,8
-3632,mul_260,call_function,mul.Tensor,backward,25,2,2,1,3434,2760,8
-3635,mul_261,call_function,mul.Tensor,backward,25,2,2,1,3437,2758,8
-3636,sub_9,call_function,sub.Tensor,backward,25,2,2,1,3438,2757,10
-3637,mul_262,call_function,mul.Tensor,backward,25,2,2,1,3439,2756,8
-3642,add_161,call_function,add.Tensor,unknown,,2,2,1,3441,2754,10
-3648,einsum_default_242,call_function,einsum.default,backward,24,2,2,1,3444,2748,5
-3654,mul_265,call_function,mul.Tensor,backward,24,2,2,1,3446,2739,8
-3673,mul_268,call_function,mul.Tensor,backward,24,2,2,1,2745,2738,8
-3671,mul_267,call_function,mul.Tensor,backward,24,2,2,1,3456,2737,8
-3675,mul_269,call_function,mul.Tensor,backward,24,2,2,1,3460,2736,8
-3653,mul_264,call_function,mul.Tensor,backward,24,2,2,1,3446,2735,8
-3658,einsum_default_244,call_function,einsum.default,backward,24,2,2,1,3449,2729,5
-3680,einsum_default_246,call_function,einsum.default,backward,24,2,2,1,3464,2729,5
-3681,add_164,call_function,add.Tensor,unknown,,2,2,1,3469,2728,10
-3690,mul_271,call_function,mul.Tensor,backward,24,2,2,1,2724,2726,8
-3689,mul_270,call_function,mul.Tensor,backward,24,2,2,1,3473,2720,8
-3693,mul_272,call_function,mul.Tensor,backward,24,2,2,1,3478,2718,8
-3696,mul_273,call_function,mul.Tensor,backward,24,2,2,1,3481,2716,8
-3697,sub_11,call_function,sub.Tensor,backward,24,2,2,1,3482,2715,10
-3698,mul_274,call_function,mul.Tensor,backward,24,2,2,1,3483,2714,8
-3703,add_165,call_function,add.Tensor,unknown,,2,2,1,3485,2712,10
-3709,einsum_default_248,call_function,einsum.default,backward,24,2,2,1,3488,2706,5
-3728,mul_276,call_function,mul.Tensor,backward,24,2,2,1,3502,2672,8
-3733,mul_277,call_function,mul.Tensor,backward,24,2,2,1,3502,2671,8
-3746,einsum_default_250,call_function,einsum.default,backward,24,2,2,1,3500,2662,5
-3753,einsum_default_252,call_function,einsum.default,backward,24,2,2,1,3509,2662,5
-3754,add_166,call_function,add.Tensor,unknown,,2,2,1,3516,2661,10
-3761,einsum_default_254,call_function,einsum.default,backward,24,2,2,1,3509,2661,5
-3762,add_167,call_function,add.Tensor,unknown,,2,2,1,3532,2660,10
-3771,mul_279,call_function,mul.Tensor,backward,24,2,2,1,2657,2658,8
-3770,mul_278,call_function,mul.Tensor,backward,24,2,2,1,3536,2652,8
-3774,mul_280,call_function,mul.Tensor,backward,24,2,2,1,3541,2650,8
-3777,mul_281,call_function,mul.Tensor,backward,24,2,2,1,3544,2648,8
-3778,sub_12,call_function,sub.Tensor,backward,24,2,2,1,3545,2647,10
-3779,mul_282,call_function,mul.Tensor,backward,24,2,2,1,3546,2646,8
-3784,add_168,call_function,add.Tensor,unknown,,2,2,1,3548,2644,10
-3790,einsum_default_256,call_function,einsum.default,backward,23,2,2,1,3551,2638,5
-3796,mul_285,call_function,mul.Tensor,backward,23,2,2,1,3553,2629,8
-3815,mul_288,call_function,mul.Tensor,backward,23,2,2,1,2635,2628,8
-3813,mul_287,call_function,mul.Tensor,backward,23,2,2,1,3563,2627,8
-3817,mul_289,call_function,mul.Tensor,backward,23,2,2,1,3567,2626,8
-3795,mul_284,call_function,mul.Tensor,backward,23,2,2,1,3553,2625,8
-3800,einsum_default_258,call_function,einsum.default,backward,23,2,2,1,3556,2619,5
-3822,einsum_default_260,call_function,einsum.default,backward,23,2,2,1,3571,2619,5
-3823,add_171,call_function,add.Tensor,unknown,,2,2,1,3576,2618,10
-3832,mul_291,call_function,mul.Tensor,backward,23,2,2,1,2614,2616,8
-3831,mul_290,call_function,mul.Tensor,backward,23,2,2,1,3580,2610,8
-3835,mul_292,call_function,mul.Tensor,backward,23,2,2,1,3585,2608,8
-3838,mul_293,call_function,mul.Tensor,backward,23,2,2,1,3588,2606,8
-3839,sub_14,call_function,sub.Tensor,backward,23,2,2,1,3589,2605,10
-3840,mul_294,call_function,mul.Tensor,backward,23,2,2,1,3590,2604,8
-3845,add_172,call_function,add.Tensor,unknown,,2,2,1,3592,2602,10
-3851,einsum_default_262,call_function,einsum.default,backward,23,2,2,1,3595,2596,5
-3870,mul_296,call_function,mul.Tensor,backward,23,2,2,1,3609,2562,8
-3875,mul_297,call_function,mul.Tensor,backward,23,2,2,1,3609,2561,8
-3888,einsum_default_264,call_function,einsum.default,backward,23,2,2,1,3607,2552,5
-3895,einsum_default_266,call_function,einsum.default,backward,23,2,2,1,3616,2552,5
-3896,add_173,call_function,add.Tensor,unknown,,2,2,1,3623,2551,10
-3903,einsum_default_268,call_function,einsum.default,backward,23,2,2,1,3616,2551,5
-3904,add_174,call_function,add.Tensor,unknown,,2,2,1,3639,2550,10
-3913,mul_299,call_function,mul.Tensor,backward,23,2,2,1,2547,2548,8
-3912,mul_298,call_function,mul.Tensor,backward,23,2,2,1,3643,2542,8
-3916,mul_300,call_function,mul.Tensor,backward,23,2,2,1,3648,2540,8
-3919,mul_301,call_function,mul.Tensor,backward,23,2,2,1,3651,2538,8
-3920,sub_15,call_function,sub.Tensor,backward,23,2,2,1,3652,2537,10
-3921,mul_302,call_function,mul.Tensor,backward,23,2,2,1,3653,2536,8
-3926,add_175,call_function,add.Tensor,unknown,,2,2,1,3655,2534,10
-3932,einsum_default_270,call_function,einsum.default,backward,22,2,2,1,3658,2528,5
-3938,mul_305,call_function,mul.Tensor,backward,22,2,2,1,3660,2519,8
-3957,mul_308,call_function,mul.Tensor,backward,22,2,2,1,2525,2518,8
-3955,mul_307,call_function,mul.Tensor,backward,22,2,2,1,3670,2517,8
-3959,mul_309,call_function,mul.Tensor,backward,22,2,2,1,3674,2516,8
-3937,mul_304,call_function,mul.Tensor,backward,22,2,2,1,3660,2515,8
-3942,einsum_default_272,call_function,einsum.default,backward,22,2,2,1,3663,2509,5
-3964,einsum_default_274,call_function,einsum.default,backward,22,2,2,1,3678,2509,5
-3965,add_178,call_function,add.Tensor,unknown,,2,2,1,3683,2508,10
-3974,mul_311,call_function,mul.Tensor,backward,22,2,2,1,2504,2506,8
-3973,mul_310,call_function,mul.Tensor,backward,22,2,2,1,3687,2500,8
-3977,mul_312,call_function,mul.Tensor,backward,22,2,2,1,3692,2498,8
-3980,mul_313,call_function,mul.Tensor,backward,22,2,2,1,3695,2496,8
-3981,sub_17,call_function,sub.Tensor,backward,22,2,2,1,3696,2495,10
-3982,mul_314,call_function,mul.Tensor,backward,22,2,2,1,3697,2494,8
-3987,add_179,call_function,add.Tensor,unknown,,2,2,1,3699,2492,10
-3993,einsum_default_276,call_function,einsum.default,backward,22,2,2,1,3702,2486,5
-4012,mul_316,call_function,mul.Tensor,backward,22,2,2,1,3716,2452,8
-4017,mul_317,call_function,mul.Tensor,backward,22,2,2,1,3716,2451,8
-4030,einsum_default_278,call_function,einsum.default,backward,22,2,2,1,3714,2442,5
-4037,einsum_default_280,call_function,einsum.default,backward,22,2,2,1,3723,2442,5
-4038,add_180,call_function,add.Tensor,unknown,,2,2,1,3730,2441,10
-4045,einsum_default_282,call_function,einsum.default,backward,22,2,2,1,3723,2441,5
-4046,add_181,call_function,add.Tensor,unknown,,2,2,1,3746,2440,10
-4055,mul_319,call_function,mul.Tensor,backward,22,2,2,1,2437,2438,8
-4054,mul_318,call_function,mul.Tensor,backward,22,2,2,1,3750,2432,8
-4058,mul_320,call_function,mul.Tensor,backward,22,2,2,1,3755,2430,8
-4061,mul_321,call_function,mul.Tensor,backward,22,2,2,1,3758,2428,8
-4062,sub_18,call_function,sub.Tensor,backward,22,2,2,1,3759,2427,10
-4063,mul_322,call_function,mul.Tensor,backward,22,2,2,1,3760,2426,8
-4068,add_182,call_function,add.Tensor,unknown,,2,2,1,3762,2424,10
-4074,einsum_default_284,call_function,einsum.default,backward,21,2,2,1,3765,2418,5
-4080,mul_325,call_function,mul.Tensor,backward,21,2,2,1,3767,2409,8
-4099,mul_328,call_function,mul.Tensor,backward,21,2,2,1,2415,2408,8
-4097,mul_327,call_function,mul.Tensor,backward,21,2,2,1,3777,2407,8
-4101,mul_329,call_function,mul.Tensor,backward,21,2,2,1,3781,2406,8
-4079,mul_324,call_function,mul.Tensor,backward,21,2,2,1,3767,2405,8
-4084,einsum_default_286,call_function,einsum.default,backward,21,2,2,1,3770,2399,5
-4106,einsum_default_288,call_function,einsum.default,backward,21,2,2,1,3785,2399,5
-4107,add_185,call_function,add.Tensor,unknown,,2,2,1,3790,2398,10
-4116,mul_331,call_function,mul.Tensor,backward,21,2,2,1,2394,2396,8
-4115,mul_330,call_function,mul.Tensor,backward,21,2,2,1,3794,2390,8
-4119,mul_332,call_function,mul.Tensor,backward,21,2,2,1,3799,2388,8
-4122,mul_333,call_function,mul.Tensor,backward,21,2,2,1,3802,2386,8
-4123,sub_20,call_function,sub.Tensor,backward,21,2,2,1,3803,2385,10
-4124,mul_334,call_function,mul.Tensor,backward,21,2,2,1,3804,2384,8
-4129,add_186,call_function,add.Tensor,unknown,,2,2,1,3806,2382,10
-4135,einsum_default_290,call_function,einsum.default,backward,21,2,2,1,3809,2376,5
-4154,mul_336,call_function,mul.Tensor,backward,21,2,2,1,3823,2342,8
-4159,mul_337,call_function,mul.Tensor,backward,21,2,2,1,3823,2341,8
-4172,einsum_default_292,call_function,einsum.default,backward,21,2,2,1,3821,2332,5
-4179,einsum_default_294,call_function,einsum.default,backward,21,2,2,1,3830,2332,5
-4180,add_187,call_function,add.Tensor,unknown,,2,2,1,3837,2331,10
-4187,einsum_default_296,call_function,einsum.default,backward,21,2,2,1,3830,2331,5
-4188,add_188,call_function,add.Tensor,unknown,,2,2,1,3853,2330,10
-4197,mul_339,call_function,mul.Tensor,backward,21,2,2,1,2327,2328,8
-4196,mul_338,call_function,mul.Tensor,backward,21,2,2,1,3857,2322,8
-4200,mul_340,call_function,mul.Tensor,backward,21,2,2,1,3862,2320,8
-4203,mul_341,call_function,mul.Tensor,backward,21,2,2,1,3865,2318,8
-4204,sub_21,call_function,sub.Tensor,backward,21,2,2,1,3866,2317,10
-4205,mul_342,call_function,mul.Tensor,backward,21,2,2,1,3867,2316,8
-4210,add_189,call_function,add.Tensor,unknown,,2,2,1,3869,2314,10
-4216,einsum_default_298,call_function,einsum.default,backward,20,2,2,1,3872,2308,5
-4222,mul_345,call_function,mul.Tensor,backward,20,2,2,1,3874,2299,8
-4241,mul_348,call_function,mul.Tensor,backward,20,2,2,1,2305,2298,8
-4239,mul_347,call_function,mul.Tensor,backward,20,2,2,1,3884,2297,8
-4243,mul_349,call_function,mul.Tensor,backward,20,2,2,1,3888,2296,8
-4221,mul_344,call_function,mul.Tensor,backward,20,2,2,1,3874,2295,8
-4226,einsum_default_300,call_function,einsum.default,backward,20,2,2,1,3877,2289,5
-4248,einsum_default_302,call_function,einsum.default,backward,20,2,2,1,3892,2289,5
-4249,add_192,call_function,add.Tensor,unknown,,2,2,1,3897,2288,10
-4258,mul_351,call_function,mul.Tensor,backward,20,2,2,1,2284,2286,8
-4257,mul_350,call_function,mul.Tensor,backward,20,2,2,1,3901,2280,8
-4261,mul_352,call_function,mul.Tensor,backward,20,2,2,1,3906,2278,8
-4264,mul_353,call_function,mul.Tensor,backward,20,2,2,1,3909,2276,8
-4265,sub_23,call_function,sub.Tensor,backward,20,2,2,1,3910,2275,10
-4266,mul_354,call_function,mul.Tensor,backward,20,2,2,1,3911,2274,8
-4271,add_193,call_function,add.Tensor,unknown,,2,2,1,3913,2272,10
-4277,einsum_default_304,call_function,einsum.default,backward,20,2,2,1,3916,2266,5
-4296,mul_356,call_function,mul.Tensor,backward,20,2,2,1,3930,2232,8
-4301,mul_357,call_function,mul.Tensor,backward,20,2,2,1,3930,2231,8
-4314,einsum_default_306,call_function,einsum.default,backward,20,2,2,1,3928,2222,5
-4321,einsum_default_308,call_function,einsum.default,backward,20,2,2,1,3937,2222,5
-4322,add_194,call_function,add.Tensor,unknown,,2,2,1,3944,2221,10
-4329,einsum_default_310,call_function,einsum.default,backward,20,2,2,1,3937,2221,5
-4330,add_195,call_function,add.Tensor,unknown,,2,2,1,3960,2220,10
-4339,mul_359,call_function,mul.Tensor,backward,20,2,2,1,2217,2218,8
-4338,mul_358,call_function,mul.Tensor,backward,20,2,2,1,3964,2212,8
-4342,mul_360,call_function,mul.Tensor,backward,20,2,2,1,3969,2210,8
-4345,mul_361,call_function,mul.Tensor,backward,20,2,2,1,3972,2208,8
-4346,sub_24,call_function,sub.Tensor,backward,20,2,2,1,3973,2207,10
-4347,mul_362,call_function,mul.Tensor,backward,20,2,2,1,3974,2206,8
-4352,add_196,call_function,add.Tensor,unknown,,2,2,1,3976,2204,10
-4358,einsum_default_312,call_function,einsum.default,backward,19,2,2,1,3979,2198,5
-4364,mul_365,call_function,mul.Tensor,backward,19,2,2,1,3981,2189,8
-4383,mul_368,call_function,mul.Tensor,backward,19,2,2,1,2195,2188,8
-4381,mul_367,call_function,mul.Tensor,backward,19,2,2,1,3991,2187,8
-4385,mul_369,call_function,mul.Tensor,backward,19,2,2,1,3995,2186,8
-4363,mul_364,call_function,mul.Tensor,backward,19,2,2,1,3981,2185,8
-4368,einsum_default_314,call_function,einsum.default,backward,19,2,2,1,3984,2179,5
-4390,einsum_default_316,call_function,einsum.default,backward,19,2,2,1,3999,2179,5
-4391,add_199,call_function,add.Tensor,unknown,,2,2,1,4004,2178,10
-4400,mul_371,call_function,mul.Tensor,backward,19,2,2,1,2174,2176,8
-4399,mul_370,call_function,mul.Tensor,backward,19,2,2,1,4008,2170,8
-4403,mul_372,call_function,mul.Tensor,backward,19,2,2,1,4013,2168,8
-4406,mul_373,call_function,mul.Tensor,backward,19,2,2,1,4016,2166,8
-4407,sub_26,call_function,sub.Tensor,backward,19,2,2,1,4017,2165,10
-4408,mul_374,call_function,mul.Tensor,backward,19,2,2,1,4018,2164,8
-4413,add_200,call_function,add.Tensor,unknown,,2,2,1,4020,2162,10
-4419,einsum_default_318,call_function,einsum.default,backward,19,2,2,1,4023,2156,5
-4438,mul_376,call_function,mul.Tensor,backward,19,2,2,1,4037,2122,8
-4443,mul_377,call_function,mul.Tensor,backward,19,2,2,1,4037,2121,8
-4456,einsum_default_320,call_function,einsum.default,backward,19,2,2,1,4035,2112,5
-4463,einsum_default_322,call_function,einsum.default,backward,19,2,2,1,4044,2112,5
-4464,add_201,call_function,add.Tensor,unknown,,2,2,1,4051,2111,10
-4471,einsum_default_324,call_function,einsum.default,backward,19,2,2,1,4044,2111,5
-4472,add_202,call_function,add.Tensor,unknown,,2,2,1,4067,2110,10
-4481,mul_379,call_function,mul.Tensor,backward,19,2,2,1,2107,2108,8
-4480,mul_378,call_function,mul.Tensor,backward,19,2,2,1,4071,2102,8
-4484,mul_380,call_function,mul.Tensor,backward,19,2,2,1,4076,2100,8
-4487,mul_381,call_function,mul.Tensor,backward,19,2,2,1,4079,2098,8
-4488,sub_27,call_function,sub.Tensor,backward,19,2,2,1,4080,2097,10
-4489,mul_382,call_function,mul.Tensor,backward,19,2,2,1,4081,2096,8
-4494,add_203,call_function,add.Tensor,unknown,,2,2,1,4083,2094,10
-4500,einsum_default_326,call_function,einsum.default,backward,18,2,2,1,4086,2088,5
-4506,mul_385,call_function,mul.Tensor,backward,18,2,2,1,4088,2079,8
-4525,mul_388,call_function,mul.Tensor,backward,18,2,2,1,2085,2078,8
-4523,mul_387,call_function,mul.Tensor,backward,18,2,2,1,4098,2077,8
-4527,mul_389,call_function,mul.Tensor,backward,18,2,2,1,4102,2076,8
-4505,mul_384,call_function,mul.Tensor,backward,18,2,2,1,4088,2075,8
-4510,einsum_default_328,call_function,einsum.default,backward,18,2,2,1,4091,2069,5
-4532,einsum_default_330,call_function,einsum.default,backward,18,2,2,1,4106,2069,5
-4533,add_206,call_function,add.Tensor,unknown,,2,2,1,4111,2068,10
-4542,mul_391,call_function,mul.Tensor,backward,18,2,2,1,2064,2066,8
-4541,mul_390,call_function,mul.Tensor,backward,18,2,2,1,4115,2060,8
-4545,mul_392,call_function,mul.Tensor,backward,18,2,2,1,4120,2058,8
-4548,mul_393,call_function,mul.Tensor,backward,18,2,2,1,4123,2056,8
-4549,sub_29,call_function,sub.Tensor,backward,18,2,2,1,4124,2055,10
-4550,mul_394,call_function,mul.Tensor,backward,18,2,2,1,4125,2054,8
-4555,add_207,call_function,add.Tensor,unknown,,2,2,1,4127,2052,10
-4561,einsum_default_332,call_function,einsum.default,backward,18,2,2,1,4130,2046,5
-4580,mul_396,call_function,mul.Tensor,backward,18,2,2,1,4144,2012,8
-4585,mul_397,call_function,mul.Tensor,backward,18,2,2,1,4144,2011,8
-4598,einsum_default_334,call_function,einsum.default,backward,18,2,2,1,4142,2002,5
-4605,einsum_default_336,call_function,einsum.default,backward,18,2,2,1,4151,2002,5
-4606,add_208,call_function,add.Tensor,unknown,,2,2,1,4158,2001,10
-4613,einsum_default_338,call_function,einsum.default,backward,18,2,2,1,4151,2001,5
-4614,add_209,call_function,add.Tensor,unknown,,2,2,1,4174,2000,10
-4623,mul_399,call_function,mul.Tensor,backward,18,2,2,1,1997,1998,8
-4622,mul_398,call_function,mul.Tensor,backward,18,2,2,1,4178,1992,8
-4626,mul_400,call_function,mul.Tensor,backward,18,2,2,1,4183,1990,8
-4629,mul_401,call_function,mul.Tensor,backward,18,2,2,1,4186,1988,8
-4630,sub_30,call_function,sub.Tensor,backward,18,2,2,1,4187,1987,10
-4631,mul_402,call_function,mul.Tensor,backward,18,2,2,1,4188,1986,8
-4636,add_210,call_function,add.Tensor,unknown,,2,2,1,4190,1984,10
-4642,einsum_default_340,call_function,einsum.default,backward,17,2,2,1,4193,1978,5
-4648,mul_405,call_function,mul.Tensor,backward,17,2,2,1,4195,1969,8
-4667,mul_408,call_function,mul.Tensor,backward,17,2,2,1,1975,1968,8
-4665,mul_407,call_function,mul.Tensor,backward,17,2,2,1,4205,1967,8
-4669,mul_409,call_function,mul.Tensor,backward,17,2,2,1,4209,1966,8
-4647,mul_404,call_function,mul.Tensor,backward,17,2,2,1,4195,1965,8
-4652,einsum_default_342,call_function,einsum.default,backward,17,2,2,1,4198,1959,5
-4674,einsum_default_344,call_function,einsum.default,backward,17,2,2,1,4213,1959,5
-4675,add_213,call_function,add.Tensor,unknown,,2,2,1,4218,1958,10
-4684,mul_411,call_function,mul.Tensor,backward,17,2,2,1,1954,1956,8
-4683,mul_410,call_function,mul.Tensor,backward,17,2,2,1,4222,1950,8
-4687,mul_412,call_function,mul.Tensor,backward,17,2,2,1,4227,1948,8
-4690,mul_413,call_function,mul.Tensor,backward,17,2,2,1,4230,1946,8
-4691,sub_32,call_function,sub.Tensor,backward,17,2,2,1,4231,1945,10
-4692,mul_414,call_function,mul.Tensor,backward,17,2,2,1,4232,1944,8
-4697,add_214,call_function,add.Tensor,unknown,,2,2,1,4234,1942,10
-4703,einsum_default_346,call_function,einsum.default,backward,17,2,2,1,4237,1936,5
-4722,mul_416,call_function,mul.Tensor,backward,17,2,2,1,4251,1902,8
-4727,mul_417,call_function,mul.Tensor,backward,17,2,2,1,4251,1901,8
-4740,einsum_default_348,call_function,einsum.default,backward,17,2,2,1,4249,1892,5
-4747,einsum_default_350,call_function,einsum.default,backward,17,2,2,1,4258,1892,5
-4748,add_215,call_function,add.Tensor,unknown,,2,2,1,4265,1891,10
-4755,einsum_default_352,call_function,einsum.default,backward,17,2,2,1,4258,1891,5
-4756,add_216,call_function,add.Tensor,unknown,,2,2,1,4281,1890,10
-4765,mul_419,call_function,mul.Tensor,backward,17,2,2,1,1887,1888,8
-4764,mul_418,call_function,mul.Tensor,backward,17,2,2,1,4285,1882,8
-4768,mul_420,call_function,mul.Tensor,backward,17,2,2,1,4290,1880,8
-4771,mul_421,call_function,mul.Tensor,backward,17,2,2,1,4293,1878,8
-4772,sub_33,call_function,sub.Tensor,backward,17,2,2,1,4294,1877,10
-4773,mul_422,call_function,mul.Tensor,backward,17,2,2,1,4295,1876,8
-4778,add_217,call_function,add.Tensor,unknown,,2,2,1,4297,1874,10
-4784,einsum_default_354,call_function,einsum.default,backward,16,2,2,1,4300,1868,5
-4790,mul_425,call_function,mul.Tensor,backward,16,2,2,1,4302,1859,8
-4809,mul_428,call_function,mul.Tensor,backward,16,2,2,1,1865,1858,8
-4807,mul_427,call_function,mul.Tensor,backward,16,2,2,1,4312,1857,8
-4811,mul_429,call_function,mul.Tensor,backward,16,2,2,1,4316,1856,8
-4789,mul_424,call_function,mul.Tensor,backward,16,2,2,1,4302,1855,8
-4794,einsum_default_356,call_function,einsum.default,backward,16,2,2,1,4305,1849,5
-4816,einsum_default_358,call_function,einsum.default,backward,16,2,2,1,4320,1849,5
-4817,add_220,call_function,add.Tensor,unknown,,2,2,1,4325,1848,10
-4826,mul_431,call_function,mul.Tensor,backward,16,2,2,1,1844,1846,8
-4825,mul_430,call_function,mul.Tensor,backward,16,2,2,1,4329,1840,8
-4829,mul_432,call_function,mul.Tensor,backward,16,2,2,1,4334,1838,8
-4832,mul_433,call_function,mul.Tensor,backward,16,2,2,1,4337,1836,8
-4833,sub_35,call_function,sub.Tensor,backward,16,2,2,1,4338,1835,10
-4834,mul_434,call_function,mul.Tensor,backward,16,2,2,1,4339,1834,8
-4839,add_221,call_function,add.Tensor,unknown,,2,2,1,4341,1832,10
-4845,einsum_default_360,call_function,einsum.default,backward,16,2,2,1,4344,1826,5
-4864,mul_436,call_function,mul.Tensor,backward,16,2,2,1,4358,1792,8
-4869,mul_437,call_function,mul.Tensor,backward,16,2,2,1,4358,1791,8
-4882,einsum_default_362,call_function,einsum.default,backward,16,2,2,1,4356,1782,5
-4889,einsum_default_364,call_function,einsum.default,backward,16,2,2,1,4365,1782,5
-4890,add_222,call_function,add.Tensor,unknown,,2,2,1,4372,1781,10
-4897,einsum_default_366,call_function,einsum.default,backward,16,2,2,1,4365,1781,5
-4898,add_223,call_function,add.Tensor,unknown,,2,2,1,4388,1780,10
-4907,mul_439,call_function,mul.Tensor,backward,16,2,2,1,1777,1778,8
-4906,mul_438,call_function,mul.Tensor,backward,16,2,2,1,4392,1772,8
-4910,mul_440,call_function,mul.Tensor,backward,16,2,2,1,4397,1770,8
-4913,mul_441,call_function,mul.Tensor,backward,16,2,2,1,4400,1768,8
-4914,sub_36,call_function,sub.Tensor,backward,16,2,2,1,4401,1767,10
-4915,mul_442,call_function,mul.Tensor,backward,16,2,2,1,4402,1766,8
-4920,add_224,call_function,add.Tensor,unknown,,2,2,1,4404,1764,10
-4926,einsum_default_368,call_function,einsum.default,backward,15,2,2,1,4407,1758,5
-4932,mul_445,call_function,mul.Tensor,backward,15,2,2,1,4409,1749,8
-4951,mul_448,call_function,mul.Tensor,backward,15,2,2,1,1755,1748,8
-4949,mul_447,call_function,mul.Tensor,backward,15,2,2,1,4419,1747,8
-4953,mul_449,call_function,mul.Tensor,backward,15,2,2,1,4423,1746,8
-4931,mul_444,call_function,mul.Tensor,backward,15,2,2,1,4409,1745,8
-4936,einsum_default_370,call_function,einsum.default,backward,15,2,2,1,4412,1739,5
-4958,einsum_default_372,call_function,einsum.default,backward,15,2,2,1,4427,1739,5
-4959,add_227,call_function,add.Tensor,unknown,,2,2,1,4432,1738,10
-4968,mul_451,call_function,mul.Tensor,backward,15,2,2,1,1734,1736,8
-4967,mul_450,call_function,mul.Tensor,backward,15,2,2,1,4436,1730,8
-4971,mul_452,call_function,mul.Tensor,backward,15,2,2,1,4441,1728,8
-4974,mul_453,call_function,mul.Tensor,backward,15,2,2,1,4444,1726,8
-4975,sub_38,call_function,sub.Tensor,backward,15,2,2,1,4445,1725,10
-4976,mul_454,call_function,mul.Tensor,backward,15,2,2,1,4446,1724,8
-4981,add_228,call_function,add.Tensor,unknown,,2,2,1,4448,1722,10
-4987,einsum_default_374,call_function,einsum.default,backward,15,2,2,1,4451,1716,5
-5006,mul_456,call_function,mul.Tensor,backward,15,2,2,1,4465,1682,8
-5011,mul_457,call_function,mul.Tensor,backward,15,2,2,1,4465,1681,8
-5024,einsum_default_376,call_function,einsum.default,backward,15,2,2,1,4463,1672,5
-5031,einsum_default_378,call_function,einsum.default,backward,15,2,2,1,4472,1672,5
-5032,add_229,call_function,add.Tensor,unknown,,2,2,1,4479,1671,10
-5039,einsum_default_380,call_function,einsum.default,backward,15,2,2,1,4472,1671,5
-5040,add_230,call_function,add.Tensor,unknown,,2,2,1,4495,1670,10
-5049,mul_459,call_function,mul.Tensor,backward,15,2,2,1,1667,1668,8
-5048,mul_458,call_function,mul.Tensor,backward,15,2,2,1,4499,1662,8
-5052,mul_460,call_function,mul.Tensor,backward,15,2,2,1,4504,1660,8
-5055,mul_461,call_function,mul.Tensor,backward,15,2,2,1,4507,1658,8
-5056,sub_39,call_function,sub.Tensor,backward,15,2,2,1,4508,1657,10
-5057,mul_462,call_function,mul.Tensor,backward,15,2,2,1,4509,1656,8
-5062,add_231,call_function,add.Tensor,unknown,,2,2,1,4511,1654,10
-5068,einsum_default_382,call_function,einsum.default,backward,14,2,2,1,4514,1648,5
-5074,mul_465,call_function,mul.Tensor,backward,14,2,2,1,4516,1639,8
-5093,mul_468,call_function,mul.Tensor,backward,14,2,2,1,1645,1638,8
-5091,mul_467,call_function,mul.Tensor,backward,14,2,2,1,4526,1637,8
-5095,mul_469,call_function,mul.Tensor,backward,14,2,2,1,4530,1636,8
-5073,mul_464,call_function,mul.Tensor,backward,14,2,2,1,4516,1635,8
-5078,einsum_default_384,call_function,einsum.default,backward,14,2,2,1,4519,1629,5
-5100,einsum_default_386,call_function,einsum.default,backward,14,2,2,1,4534,1629,5
-5101,add_234,call_function,add.Tensor,unknown,,2,2,1,4539,1628,10
-5110,mul_471,call_function,mul.Tensor,backward,14,2,2,1,1624,1626,8
-5109,mul_470,call_function,mul.Tensor,backward,14,2,2,1,4543,1620,8
-5113,mul_472,call_function,mul.Tensor,backward,14,2,2,1,4548,1618,8
-5116,mul_473,call_function,mul.Tensor,backward,14,2,2,1,4551,1616,8
-5117,sub_41,call_function,sub.Tensor,backward,14,2,2,1,4552,1615,10
-5118,mul_474,call_function,mul.Tensor,backward,14,2,2,1,4553,1614,8
-5123,add_235,call_function,add.Tensor,unknown,,2,2,1,4555,1612,10
-5129,einsum_default_388,call_function,einsum.default,backward,14,2,2,1,4558,1606,5
-5148,mul_476,call_function,mul.Tensor,backward,14,2,2,1,4572,1572,8
-5153,mul_477,call_function,mul.Tensor,backward,14,2,2,1,4572,1571,8
-5166,einsum_default_390,call_function,einsum.default,backward,14,2,2,1,4570,1562,5
-5173,einsum_default_392,call_function,einsum.default,backward,14,2,2,1,4579,1562,5
-5174,add_236,call_function,add.Tensor,unknown,,2,2,1,4586,1561,10
-5181,einsum_default_394,call_function,einsum.default,backward,14,2,2,1,4579,1561,5
-5182,add_237,call_function,add.Tensor,unknown,,2,2,1,4602,1560,10
-5191,mul_479,call_function,mul.Tensor,backward,14,2,2,1,1557,1558,8
-5190,mul_478,call_function,mul.Tensor,backward,14,2,2,1,4606,1552,8
-5194,mul_480,call_function,mul.Tensor,backward,14,2,2,1,4611,1550,8
-5197,mul_481,call_function,mul.Tensor,backward,14,2,2,1,4614,1548,8
-5198,sub_42,call_function,sub.Tensor,backward,14,2,2,1,4615,1547,10
-5199,mul_482,call_function,mul.Tensor,backward,14,2,2,1,4616,1546,8
-5204,add_238,call_function,add.Tensor,unknown,,2,2,1,4618,1544,10
-5210,einsum_default_396,call_function,einsum.default,backward,13,2,2,1,4621,1538,5
-5216,mul_485,call_function,mul.Tensor,backward,13,2,2,1,4623,1529,8
-5235,mul_488,call_function,mul.Tensor,backward,13,2,2,1,1535,1528,8
-5233,mul_487,call_function,mul.Tensor,backward,13,2,2,1,4633,1527,8
-5237,mul_489,call_function,mul.Tensor,backward,13,2,2,1,4637,1526,8
-5215,mul_484,call_function,mul.Tensor,backward,13,2,2,1,4623,1525,8
-5220,einsum_default_398,call_function,einsum.default,backward,13,2,2,1,4626,1519,5
-5242,einsum_default_400,call_function,einsum.default,backward,13,2,2,1,4641,1519,5
-5243,add_241,call_function,add.Tensor,unknown,,2,2,1,4646,1518,10
-5252,mul_491,call_function,mul.Tensor,backward,13,2,2,1,1514,1516,8
-5251,mul_490,call_function,mul.Tensor,backward,13,2,2,1,4650,1510,8
-5255,mul_492,call_function,mul.Tensor,backward,13,2,2,1,4655,1508,8
-5258,mul_493,call_function,mul.Tensor,backward,13,2,2,1,4658,1506,8
-5259,sub_44,call_function,sub.Tensor,backward,13,2,2,1,4659,1505,10
-5260,mul_494,call_function,mul.Tensor,backward,13,2,2,1,4660,1504,8
-5265,add_242,call_function,add.Tensor,unknown,,2,2,1,4662,1502,10
-5271,einsum_default_402,call_function,einsum.default,backward,13,2,2,1,4665,1496,5
-5290,mul_496,call_function,mul.Tensor,backward,13,2,2,1,4679,1462,8
-5295,mul_497,call_function,mul.Tensor,backward,13,2,2,1,4679,1461,8
-5308,einsum_default_404,call_function,einsum.default,backward,13,2,2,1,4677,1452,5
-5315,einsum_default_406,call_function,einsum.default,backward,13,2,2,1,4686,1452,5
-5316,add_243,call_function,add.Tensor,unknown,,2,2,1,4693,1451,10
-5323,einsum_default_408,call_function,einsum.default,backward,13,2,2,1,4686,1451,5
-5324,add_244,call_function,add.Tensor,unknown,,2,2,1,4709,1450,10
-5333,mul_499,call_function,mul.Tensor,backward,13,2,2,1,1447,1448,8
-5332,mul_498,call_function,mul.Tensor,backward,13,2,2,1,4713,1442,8
-5336,mul_500,call_function,mul.Tensor,backward,13,2,2,1,4718,1440,8
-5339,mul_501,call_function,mul.Tensor,backward,13,2,2,1,4721,1438,8
-5340,sub_45,call_function,sub.Tensor,backward,13,2,2,1,4722,1437,10
-5341,mul_502,call_function,mul.Tensor,backward,13,2,2,1,4723,1436,8
-5346,add_245,call_function,add.Tensor,unknown,,2,2,1,4725,1434,10
-5352,einsum_default_410,call_function,einsum.default,backward,12,2,2,1,4728,1428,5
-5358,mul_505,call_function,mul.Tensor,backward,12,2,2,1,4730,1419,8
-5377,mul_508,call_function,mul.Tensor,backward,12,2,2,1,1425,1418,8
-5375,mul_507,call_function,mul.Tensor,backward,12,2,2,1,4740,1417,8
-5379,mul_509,call_function,mul.Tensor,backward,12,2,2,1,4744,1416,8
-5357,mul_504,call_function,mul.Tensor,backward,12,2,2,1,4730,1415,8
-5362,einsum_default_412,call_function,einsum.default,backward,12,2,2,1,4733,1409,5
-5384,einsum_default_414,call_function,einsum.default,backward,12,2,2,1,4748,1409,5
-5385,add_248,call_function,add.Tensor,unknown,,2,2,1,4753,1408,10
-5394,mul_511,call_function,mul.Tensor,backward,12,2,2,1,1404,1406,8
-5393,mul_510,call_function,mul.Tensor,backward,12,2,2,1,4757,1400,8
-5397,mul_512,call_function,mul.Tensor,backward,12,2,2,1,4762,1398,8
-5400,mul_513,call_function,mul.Tensor,backward,12,2,2,1,4765,1396,8
-5401,sub_47,call_function,sub.Tensor,backward,12,2,2,1,4766,1395,10
-5402,mul_514,call_function,mul.Tensor,backward,12,2,2,1,4767,1394,8
-5407,add_249,call_function,add.Tensor,unknown,,2,2,1,4769,1392,10
-5413,einsum_default_416,call_function,einsum.default,backward,12,2,2,1,4772,1386,5
-5432,mul_516,call_function,mul.Tensor,backward,12,2,2,1,4786,1352,8
-5437,mul_517,call_function,mul.Tensor,backward,12,2,2,1,4786,1351,8
-5450,einsum_default_418,call_function,einsum.default,backward,12,2,2,1,4784,1342,5
-5457,einsum_default_420,call_function,einsum.default,backward,12,2,2,1,4793,1342,5
-5458,add_250,call_function,add.Tensor,unknown,,2,2,1,4800,1341,10
-5465,einsum_default_422,call_function,einsum.default,backward,12,2,2,1,4793,1341,5
-5466,add_251,call_function,add.Tensor,unknown,,2,2,1,4816,1340,10
-5475,mul_519,call_function,mul.Tensor,backward,12,2,2,1,1337,1338,8
-5474,mul_518,call_function,mul.Tensor,backward,12,2,2,1,4820,1332,8
-5478,mul_520,call_function,mul.Tensor,backward,12,2,2,1,4825,1330,8
-5481,mul_521,call_function,mul.Tensor,backward,12,2,2,1,4828,1328,8
-5482,sub_48,call_function,sub.Tensor,backward,12,2,2,1,4829,1327,10
-5483,mul_522,call_function,mul.Tensor,backward,12,2,2,1,4830,1326,8
-5488,add_252,call_function,add.Tensor,unknown,,2,2,1,4832,1324,10
-5494,einsum_default_424,call_function,einsum.default,backward,11,2,2,1,4835,1318,5
-5500,mul_525,call_function,mul.Tensor,backward,11,2,2,1,4837,1309,8
-5519,mul_528,call_function,mul.Tensor,backward,11,2,2,1,1315,1308,8
-5517,mul_527,call_function,mul.Tensor,backward,11,2,2,1,4847,1307,8
-5521,mul_529,call_function,mul.Tensor,backward,11,2,2,1,4851,1306,8
-5499,mul_524,call_function,mul.Tensor,backward,11,2,2,1,4837,1305,8
-5504,einsum_default_426,call_function,einsum.default,backward,11,2,2,1,4840,1299,5
-5526,einsum_default_428,call_function,einsum.default,backward,11,2,2,1,4855,1299,5
-5527,add_255,call_function,add.Tensor,unknown,,2,2,1,4860,1298,10
-5536,mul_531,call_function,mul.Tensor,backward,11,2,2,1,1294,1296,8
-5535,mul_530,call_function,mul.Tensor,backward,11,2,2,1,4864,1290,8
-5539,mul_532,call_function,mul.Tensor,backward,11,2,2,1,4869,1288,8
-5542,mul_533,call_function,mul.Tensor,backward,11,2,2,1,4872,1286,8
-5543,sub_50,call_function,sub.Tensor,backward,11,2,2,1,4873,1285,10
-5544,mul_534,call_function,mul.Tensor,backward,11,2,2,1,4874,1284,8
-5549,add_256,call_function,add.Tensor,unknown,,2,2,1,4876,1282,10
-5555,einsum_default_430,call_function,einsum.default,backward,11,2,2,1,4879,1276,5
-5574,mul_536,call_function,mul.Tensor,backward,11,2,2,1,4893,1242,8
-5579,mul_537,call_function,mul.Tensor,backward,11,2,2,1,4893,1241,8
-5592,einsum_default_432,call_function,einsum.default,backward,11,2,2,1,4891,1232,5
-5599,einsum_default_434,call_function,einsum.default,backward,11,2,2,1,4900,1232,5
-5600,add_257,call_function,add.Tensor,unknown,,2,2,1,4907,1231,10
-5607,einsum_default_436,call_function,einsum.default,backward,11,2,2,1,4900,1231,5
-5608,add_258,call_function,add.Tensor,unknown,,2,2,1,4923,1230,10
-5617,mul_539,call_function,mul.Tensor,backward,11,2,2,1,1227,1228,8
-5616,mul_538,call_function,mul.Tensor,backward,11,2,2,1,4927,1222,8
-5620,mul_540,call_function,mul.Tensor,backward,11,2,2,1,4932,1220,8
-5623,mul_541,call_function,mul.Tensor,backward,11,2,2,1,4935,1218,8
-5624,sub_51,call_function,sub.Tensor,backward,11,2,2,1,4936,1217,10
-5625,mul_542,call_function,mul.Tensor,backward,11,2,2,1,4937,1216,8
-5630,add_259,call_function,add.Tensor,unknown,,2,2,1,4939,1214,10
-5636,einsum_default_438,call_function,einsum.default,backward,10,2,2,1,4942,1208,5
-5642,mul_545,call_function,mul.Tensor,backward,10,2,2,1,4944,1199,8
-5661,mul_548,call_function,mul.Tensor,backward,10,2,2,1,1205,1198,8
-5659,mul_547,call_function,mul.Tensor,backward,10,2,2,1,4954,1197,8
-5663,mul_549,call_function,mul.Tensor,backward,10,2,2,1,4958,1196,8
-5641,mul_544,call_function,mul.Tensor,backward,10,2,2,1,4944,1195,8
-5646,einsum_default_440,call_function,einsum.default,backward,10,2,2,1,4947,1189,5
-5668,einsum_default_442,call_function,einsum.default,backward,10,2,2,1,4962,1189,5
-5669,add_262,call_function,add.Tensor,unknown,,2,2,1,4967,1188,10
-5678,mul_551,call_function,mul.Tensor,backward,10,2,2,1,1184,1186,8
-5677,mul_550,call_function,mul.Tensor,backward,10,2,2,1,4971,1180,8
-5681,mul_552,call_function,mul.Tensor,backward,10,2,2,1,4976,1178,8
-5684,mul_553,call_function,mul.Tensor,backward,10,2,2,1,4979,1176,8
-5685,sub_53,call_function,sub.Tensor,backward,10,2,2,1,4980,1175,10
-5686,mul_554,call_function,mul.Tensor,backward,10,2,2,1,4981,1174,8
-5691,add_263,call_function,add.Tensor,unknown,,2,2,1,4983,1172,10
-5697,einsum_default_444,call_function,einsum.default,backward,10,2,2,1,4986,1166,5
-5716,mul_556,call_function,mul.Tensor,backward,10,2,2,1,5000,1132,8
-5721,mul_557,call_function,mul.Tensor,backward,10,2,2,1,5000,1131,8
-5734,einsum_default_446,call_function,einsum.default,backward,10,2,2,1,4998,1122,5
-5741,einsum_default_448,call_function,einsum.default,backward,10,2,2,1,5007,1122,5
-5742,add_264,call_function,add.Tensor,unknown,,2,2,1,5014,1121,10
-5749,einsum_default_450,call_function,einsum.default,backward,10,2,2,1,5007,1121,5
-5750,add_265,call_function,add.Tensor,unknown,,2,2,1,5030,1120,10
-5759,mul_559,call_function,mul.Tensor,backward,10,2,2,1,1117,1118,8
-5758,mul_558,call_function,mul.Tensor,backward,10,2,2,1,5034,1112,8
-5762,mul_560,call_function,mul.Tensor,backward,10,2,2,1,5039,1110,8
-5765,mul_561,call_function,mul.Tensor,backward,10,2,2,1,5042,1108,8
-5766,sub_54,call_function,sub.Tensor,backward,10,2,2,1,5043,1107,10
-5767,mul_562,call_function,mul.Tensor,backward,10,2,2,1,5044,1106,8
-5772,add_266,call_function,add.Tensor,unknown,,2,2,1,5046,1104,10
-5778,einsum_default_452,call_function,einsum.default,backward,9,2,2,1,5049,1098,5
-5784,mul_565,call_function,mul.Tensor,backward,9,2,2,1,5051,1089,8
-5803,mul_568,call_function,mul.Tensor,backward,9,2,2,1,1095,1088,8
-5801,mul_567,call_function,mul.Tensor,backward,9,2,2,1,5061,1087,8
-5805,mul_569,call_function,mul.Tensor,backward,9,2,2,1,5065,1086,8
-5783,mul_564,call_function,mul.Tensor,backward,9,2,2,1,5051,1085,8
-5788,einsum_default_454,call_function,einsum.default,backward,9,2,2,1,5054,1079,5
-5810,einsum_default_456,call_function,einsum.default,backward,9,2,2,1,5069,1079,5
-5811,add_269,call_function,add.Tensor,unknown,,2,2,1,5074,1078,10
-5820,mul_571,call_function,mul.Tensor,backward,9,2,2,1,1074,1076,8
-5819,mul_570,call_function,mul.Tensor,backward,9,2,2,1,5078,1070,8
-5823,mul_572,call_function,mul.Tensor,backward,9,2,2,1,5083,1068,8
-5826,mul_573,call_function,mul.Tensor,backward,9,2,2,1,5086,1066,8
-5827,sub_56,call_function,sub.Tensor,backward,9,2,2,1,5087,1065,10
-5828,mul_574,call_function,mul.Tensor,backward,9,2,2,1,5088,1064,8
-5833,add_270,call_function,add.Tensor,unknown,,2,2,1,5090,1062,10
-5839,einsum_default_458,call_function,einsum.default,backward,9,2,2,1,5093,1056,5
-5858,mul_576,call_function,mul.Tensor,backward,9,2,2,1,5107,1022,8
-5863,mul_577,call_function,mul.Tensor,backward,9,2,2,1,5107,1021,8
-5876,einsum_default_460,call_function,einsum.default,backward,9,2,2,1,5105,1012,5
-5883,einsum_default_462,call_function,einsum.default,backward,9,2,2,1,5114,1012,5
-5884,add_271,call_function,add.Tensor,unknown,,2,2,1,5121,1011,10
-5891,einsum_default_464,call_function,einsum.default,backward,9,2,2,1,5114,1011,5
-5892,add_272,call_function,add.Tensor,unknown,,2,2,1,5137,1010,10
-5901,mul_579,call_function,mul.Tensor,backward,9,2,2,1,1007,1008,8
-5900,mul_578,call_function,mul.Tensor,backward,9,2,2,1,5141,1002,8
-5904,mul_580,call_function,mul.Tensor,backward,9,2,2,1,5146,1000,8
-5907,mul_581,call_function,mul.Tensor,backward,9,2,2,1,5149,998,8
-5908,sub_57,call_function,sub.Tensor,backward,9,2,2,1,5150,997,10
-5909,mul_582,call_function,mul.Tensor,backward,9,2,2,1,5151,996,8
-5914,add_273,call_function,add.Tensor,unknown,,2,2,1,5153,994,10
-5920,einsum_default_466,call_function,einsum.default,backward,8,2,2,1,5156,988,5
-5926,mul_585,call_function,mul.Tensor,backward,8,2,2,1,5158,979,8
-5945,mul_588,call_function,mul.Tensor,backward,8,2,2,1,985,978,8
-5943,mul_587,call_function,mul.Tensor,backward,8,2,2,1,5168,977,8
-5947,mul_589,call_function,mul.Tensor,backward,8,2,2,1,5172,976,8
-5925,mul_584,call_function,mul.Tensor,backward,8,2,2,1,5158,975,8
-5930,einsum_default_468,call_function,einsum.default,backward,8,2,2,1,5161,969,5
-5952,einsum_default_470,call_function,einsum.default,backward,8,2,2,1,5176,969,5
-5953,add_276,call_function,add.Tensor,unknown,,2,2,1,5181,968,10
-5962,mul_591,call_function,mul.Tensor,backward,8,2,2,1,964,966,8
-5961,mul_590,call_function,mul.Tensor,backward,8,2,2,1,5185,960,8
-5965,mul_592,call_function,mul.Tensor,backward,8,2,2,1,5190,958,8
-5968,mul_593,call_function,mul.Tensor,backward,8,2,2,1,5193,956,8
-5969,sub_59,call_function,sub.Tensor,backward,8,2,2,1,5194,955,10
-5970,mul_594,call_function,mul.Tensor,backward,8,2,2,1,5195,954,8
-5975,add_277,call_function,add.Tensor,unknown,,2,2,1,5197,952,10
-5981,einsum_default_472,call_function,einsum.default,backward,8,2,2,1,5200,946,5
-6000,mul_596,call_function,mul.Tensor,backward,8,2,2,1,5214,912,8
-6005,mul_597,call_function,mul.Tensor,backward,8,2,2,1,5214,911,8
-6018,einsum_default_474,call_function,einsum.default,backward,8,2,2,1,5212,902,5
-6025,einsum_default_476,call_function,einsum.default,backward,8,2,2,1,5221,902,5
-6026,add_278,call_function,add.Tensor,unknown,,2,2,1,5228,901,10
-6033,einsum_default_478,call_function,einsum.default,backward,8,2,2,1,5221,901,5
-6034,add_279,call_function,add.Tensor,unknown,,2,2,1,5244,900,10
-6043,mul_599,call_function,mul.Tensor,backward,8,2,2,1,897,898,8
-6042,mul_598,call_function,mul.Tensor,backward,8,2,2,1,5248,892,8
-6046,mul_600,call_function,mul.Tensor,backward,8,2,2,1,5253,890,8
-6049,mul_601,call_function,mul.Tensor,backward,8,2,2,1,5256,888,8
-6050,sub_60,call_function,sub.Tensor,backward,8,2,2,1,5257,887,10
-6051,mul_602,call_function,mul.Tensor,backward,8,2,2,1,5258,886,8
-6056,add_280,call_function,add.Tensor,unknown,,2,2,1,5260,884,10
-6062,einsum_default_480,call_function,einsum.default,backward,7,2,2,1,5263,878,5
-6068,mul_605,call_function,mul.Tensor,backward,7,2,2,1,5265,869,8
-6087,mul_608,call_function,mul.Tensor,backward,7,2,2,1,875,868,8
-6085,mul_607,call_function,mul.Tensor,backward,7,2,2,1,5275,867,8
-6089,mul_609,call_function,mul.Tensor,backward,7,2,2,1,5279,866,8
-6067,mul_604,call_function,mul.Tensor,backward,7,2,2,1,5265,865,8
-6072,einsum_default_482,call_function,einsum.default,backward,7,2,2,1,5268,859,5
-6094,einsum_default_484,call_function,einsum.default,backward,7,2,2,1,5283,859,5
-6095,add_283,call_function,add.Tensor,unknown,,2,2,1,5288,858,10
-6104,mul_611,call_function,mul.Tensor,backward,7,2,2,1,854,856,8
-6103,mul_610,call_function,mul.Tensor,backward,7,2,2,1,5292,850,8
-6107,mul_612,call_function,mul.Tensor,backward,7,2,2,1,5297,848,8
-6110,mul_613,call_function,mul.Tensor,backward,7,2,2,1,5300,846,8
-6111,sub_62,call_function,sub.Tensor,backward,7,2,2,1,5301,845,10
-6112,mul_614,call_function,mul.Tensor,backward,7,2,2,1,5302,844,8
-6117,add_284,call_function,add.Tensor,unknown,,2,2,1,5304,842,10
-6123,einsum_default_486,call_function,einsum.default,backward,7,2,2,1,5307,836,5
-6142,mul_616,call_function,mul.Tensor,backward,7,2,2,1,5321,802,8
-6147,mul_617,call_function,mul.Tensor,backward,7,2,2,1,5321,801,8
-6160,einsum_default_488,call_function,einsum.default,backward,7,2,2,1,5319,792,5
-6167,einsum_default_490,call_function,einsum.default,backward,7,2,2,1,5328,792,5
-6168,add_285,call_function,add.Tensor,unknown,,2,2,1,5335,791,10
-6175,einsum_default_492,call_function,einsum.default,backward,7,2,2,1,5328,791,5
-6176,add_286,call_function,add.Tensor,unknown,,2,2,1,5351,790,10
-6185,mul_619,call_function,mul.Tensor,backward,7,2,2,1,787,788,8
-6184,mul_618,call_function,mul.Tensor,backward,7,2,2,1,5355,782,8
-6188,mul_620,call_function,mul.Tensor,backward,7,2,2,1,5360,780,8
-6191,mul_621,call_function,mul.Tensor,backward,7,2,2,1,5363,778,8
-6192,sub_63,call_function,sub.Tensor,backward,7,2,2,1,5364,777,10
-6193,mul_622,call_function,mul.Tensor,backward,7,2,2,1,5365,776,8
-6198,add_287,call_function,add.Tensor,unknown,,2,2,1,5367,774,10
-6204,einsum_default_494,call_function,einsum.default,backward,6,2,2,1,5370,768,5
-6210,mul_625,call_function,mul.Tensor,backward,6,2,2,1,5372,759,8
-6229,mul_628,call_function,mul.Tensor,backward,6,2,2,1,765,758,8
-6227,mul_627,call_function,mul.Tensor,backward,6,2,2,1,5382,757,8
-6231,mul_629,call_function,mul.Tensor,backward,6,2,2,1,5386,756,8
-6209,mul_624,call_function,mul.Tensor,backward,6,2,2,1,5372,755,8
-6214,einsum_default_496,call_function,einsum.default,backward,6,2,2,1,5375,749,5
-6236,einsum_default_498,call_function,einsum.default,backward,6,2,2,1,5390,749,5
-6237,add_290,call_function,add.Tensor,unknown,,2,2,1,5395,748,10
-6246,mul_631,call_function,mul.Tensor,backward,6,2,2,1,744,746,8
-6245,mul_630,call_function,mul.Tensor,backward,6,2,2,1,5399,740,8
-6249,mul_632,call_function,mul.Tensor,backward,6,2,2,1,5404,738,8
-6252,mul_633,call_function,mul.Tensor,backward,6,2,2,1,5407,736,8
-6253,sub_65,call_function,sub.Tensor,backward,6,2,2,1,5408,735,10
-6254,mul_634,call_function,mul.Tensor,backward,6,2,2,1,5409,734,8
-6259,add_291,call_function,add.Tensor,unknown,,2,2,1,5411,732,10
-6265,einsum_default_500,call_function,einsum.default,backward,6,2,2,1,5414,726,5
-6284,mul_636,call_function,mul.Tensor,backward,6,2,2,1,5428,692,8
-6289,mul_637,call_function,mul.Tensor,backward,6,2,2,1,5428,691,8
-6302,einsum_default_502,call_function,einsum.default,backward,6,2,2,1,5426,682,5
-6309,einsum_default_504,call_function,einsum.default,backward,6,2,2,1,5435,682,5
-6310,add_292,call_function,add.Tensor,unknown,,2,2,1,5442,681,10
-6317,einsum_default_506,call_function,einsum.default,backward,6,2,2,1,5435,681,5
-6318,add_293,call_function,add.Tensor,unknown,,2,2,1,5458,680,10
-6327,mul_639,call_function,mul.Tensor,backward,6,2,2,1,677,678,8
-6326,mul_638,call_function,mul.Tensor,backward,6,2,2,1,5462,672,8
-6330,mul_640,call_function,mul.Tensor,backward,6,2,2,1,5467,670,8
-6333,mul_641,call_function,mul.Tensor,backward,6,2,2,1,5470,668,8
-6334,sub_66,call_function,sub.Tensor,backward,6,2,2,1,5471,667,10
-6335,mul_642,call_function,mul.Tensor,backward,6,2,2,1,5472,666,8
-6340,add_294,call_function,add.Tensor,unknown,,2,2,1,5474,664,10
-6346,einsum_default_508,call_function,einsum.default,backward,5,2,2,1,5477,658,5
-6352,mul_645,call_function,mul.Tensor,backward,5,2,2,1,5479,649,8
-6371,mul_648,call_function,mul.Tensor,backward,5,2,2,1,655,648,8
-6369,mul_647,call_function,mul.Tensor,backward,5,2,2,1,5489,647,8
-6373,mul_649,call_function,mul.Tensor,backward,5,2,2,1,5493,646,8
-6351,mul_644,call_function,mul.Tensor,backward,5,2,2,1,5479,645,8
-6356,einsum_default_510,call_function,einsum.default,backward,5,2,2,1,5482,639,5
-6378,einsum_default_512,call_function,einsum.default,backward,5,2,2,1,5497,639,5
-6379,add_297,call_function,add.Tensor,unknown,,2,2,1,5502,638,10
-6388,mul_651,call_function,mul.Tensor,backward,5,2,2,1,634,636,8
-6387,mul_650,call_function,mul.Tensor,backward,5,2,2,1,5506,630,8
-6391,mul_652,call_function,mul.Tensor,backward,5,2,2,1,5511,628,8
-6394,mul_653,call_function,mul.Tensor,backward,5,2,2,1,5514,626,8
-6395,sub_68,call_function,sub.Tensor,backward,5,2,2,1,5515,625,10
-6396,mul_654,call_function,mul.Tensor,backward,5,2,2,1,5516,624,8
-6401,add_298,call_function,add.Tensor,unknown,,2,2,1,5518,622,10
-6407,einsum_default_514,call_function,einsum.default,backward,5,2,2,1,5521,616,5
-6426,mul_656,call_function,mul.Tensor,backward,5,2,2,1,5535,582,8
-6431,mul_657,call_function,mul.Tensor,backward,5,2,2,1,5535,581,8
-6444,einsum_default_516,call_function,einsum.default,backward,5,2,2,1,5533,572,5
-6451,einsum_default_518,call_function,einsum.default,backward,5,2,2,1,5542,572,5
-6452,add_299,call_function,add.Tensor,unknown,,2,2,1,5549,571,10
-6459,einsum_default_520,call_function,einsum.default,backward,5,2,2,1,5542,571,5
-6460,add_300,call_function,add.Tensor,unknown,,2,2,1,5565,570,10
-6469,mul_659,call_function,mul.Tensor,backward,5,2,2,1,567,568,8
-6468,mul_658,call_function,mul.Tensor,backward,5,2,2,1,5569,562,8
-6472,mul_660,call_function,mul.Tensor,backward,5,2,2,1,5574,560,8
-6475,mul_661,call_function,mul.Tensor,backward,5,2,2,1,5577,558,8
-6476,sub_69,call_function,sub.Tensor,backward,5,2,2,1,5578,557,10
-6477,mul_662,call_function,mul.Tensor,backward,5,2,2,1,5579,556,8
-6482,add_301,call_function,add.Tensor,unknown,,2,2,1,5581,554,10
-6488,einsum_default_522,call_function,einsum.default,backward,4,2,2,1,5584,548,5
-6494,mul_665,call_function,mul.Tensor,backward,4,2,2,1,5586,539,8
-6513,mul_668,call_function,mul.Tensor,backward,4,2,2,1,545,538,8
-6511,mul_667,call_function,mul.Tensor,backward,4,2,2,1,5596,537,8
-6515,mul_669,call_function,mul.Tensor,backward,4,2,2,1,5600,536,8
-6493,mul_664,call_function,mul.Tensor,backward,4,2,2,1,5586,535,8
-6498,einsum_default_524,call_function,einsum.default,backward,4,2,2,1,5589,529,5
-6520,einsum_default_526,call_function,einsum.default,backward,4,2,2,1,5604,529,5
-6521,add_304,call_function,add.Tensor,unknown,,2,2,1,5609,528,10
-6530,mul_671,call_function,mul.Tensor,backward,4,2,2,1,524,526,8
-6529,mul_670,call_function,mul.Tensor,backward,4,2,2,1,5613,520,8
-6533,mul_672,call_function,mul.Tensor,backward,4,2,2,1,5618,518,8
-6536,mul_673,call_function,mul.Tensor,backward,4,2,2,1,5621,516,8
-6537,sub_71,call_function,sub.Tensor,backward,4,2,2,1,5622,515,10
-6538,mul_674,call_function,mul.Tensor,backward,4,2,2,1,5623,514,8
-6543,add_305,call_function,add.Tensor,unknown,,2,2,1,5625,512,10
-6549,einsum_default_528,call_function,einsum.default,backward,4,2,2,1,5628,506,5
-6568,mul_676,call_function,mul.Tensor,backward,4,2,2,1,5642,472,8
-6573,mul_677,call_function,mul.Tensor,backward,4,2,2,1,5642,471,8
-6586,einsum_default_530,call_function,einsum.default,backward,4,2,2,1,5640,462,5
-6593,einsum_default_532,call_function,einsum.default,backward,4,2,2,1,5649,462,5
-6594,add_306,call_function,add.Tensor,unknown,,2,2,1,5656,461,10
-6601,einsum_default_534,call_function,einsum.default,backward,4,2,2,1,5649,461,5
-6602,add_307,call_function,add.Tensor,unknown,,2,2,1,5672,460,10
-6611,mul_679,call_function,mul.Tensor,backward,4,2,2,1,457,458,8
-6610,mul_678,call_function,mul.Tensor,backward,4,2,2,1,5676,452,8
-6614,mul_680,call_function,mul.Tensor,backward,4,2,2,1,5681,450,8
-6617,mul_681,call_function,mul.Tensor,backward,4,2,2,1,5684,448,8
-6618,sub_72,call_function,sub.Tensor,backward,4,2,2,1,5685,447,10
-6619,mul_682,call_function,mul.Tensor,backward,4,2,2,1,5686,446,8
-6624,add_308,call_function,add.Tensor,unknown,,2,2,1,5688,444,10
-6630,einsum_default_536,call_function,einsum.default,backward,3,2,2,1,5691,438,5
-6636,mul_685,call_function,mul.Tensor,backward,3,2,2,1,5693,429,8
-6655,mul_688,call_function,mul.Tensor,backward,3,2,2,1,435,428,8
-6653,mul_687,call_function,mul.Tensor,backward,3,2,2,1,5703,427,8
-6657,mul_689,call_function,mul.Tensor,backward,3,2,2,1,5707,426,8
-6635,mul_684,call_function,mul.Tensor,backward,3,2,2,1,5693,425,8
-6640,einsum_default_538,call_function,einsum.default,backward,3,2,2,1,5696,419,5
-6662,einsum_default_540,call_function,einsum.default,backward,3,2,2,1,5711,419,5
-6663,add_311,call_function,add.Tensor,unknown,,2,2,1,5716,418,10
-6672,mul_691,call_function,mul.Tensor,backward,3,2,2,1,414,416,8
-6671,mul_690,call_function,mul.Tensor,backward,3,2,2,1,5720,410,8
-6675,mul_692,call_function,mul.Tensor,backward,3,2,2,1,5725,408,8
-6678,mul_693,call_function,mul.Tensor,backward,3,2,2,1,5728,406,8
-6679,sub_74,call_function,sub.Tensor,backward,3,2,2,1,5729,405,10
-6680,mul_694,call_function,mul.Tensor,backward,3,2,2,1,5730,404,8
-6685,add_312,call_function,add.Tensor,unknown,,2,2,1,5732,402,10
-6691,einsum_default_542,call_function,einsum.default,backward,3,2,2,1,5735,396,5
-6710,mul_696,call_function,mul.Tensor,backward,3,2,2,1,5749,362,8
-6715,mul_697,call_function,mul.Tensor,backward,3,2,2,1,5749,361,8
-6728,einsum_default_544,call_function,einsum.default,backward,3,2,2,1,5747,352,5
-6735,einsum_default_546,call_function,einsum.default,backward,3,2,2,1,5756,352,5
-6736,add_313,call_function,add.Tensor,unknown,,2,2,1,5763,351,10
-6743,einsum_default_548,call_function,einsum.default,backward,3,2,2,1,5756,351,5
-6744,add_314,call_function,add.Tensor,unknown,,2,2,1,5779,350,10
-6753,mul_699,call_function,mul.Tensor,backward,3,2,2,1,347,348,8
-6752,mul_698,call_function,mul.Tensor,backward,3,2,2,1,5783,342,8
-6756,mul_700,call_function,mul.Tensor,backward,3,2,2,1,5788,340,8
-6759,mul_701,call_function,mul.Tensor,backward,3,2,2,1,5791,338,8
-6760,sub_75,call_function,sub.Tensor,backward,3,2,2,1,5792,337,10
-6761,mul_702,call_function,mul.Tensor,backward,3,2,2,1,5793,336,8
-6766,add_315,call_function,add.Tensor,unknown,,2,2,1,5795,334,10
-6772,einsum_default_550,call_function,einsum.default,backward,2,2,2,1,5798,328,5
-6778,mul_705,call_function,mul.Tensor,backward,2,2,2,1,5800,319,8
-6797,mul_708,call_function,mul.Tensor,backward,2,2,2,1,325,318,8
-6795,mul_707,call_function,mul.Tensor,backward,2,2,2,1,5810,317,8
-6799,mul_709,call_function,mul.Tensor,backward,2,2,2,1,5814,316,8
-6777,mul_704,call_function,mul.Tensor,backward,2,2,2,1,5800,315,8
-6782,einsum_default_552,call_function,einsum.default,backward,2,2,2,1,5803,309,5
-6804,einsum_default_554,call_function,einsum.default,backward,2,2,2,1,5818,309,5
-6805,add_318,call_function,add.Tensor,unknown,,2,2,1,5823,308,10
-6814,mul_711,call_function,mul.Tensor,backward,2,2,2,1,304,306,8
-6813,mul_710,call_function,mul.Tensor,backward,2,2,2,1,5827,300,8
-6817,mul_712,call_function,mul.Tensor,backward,2,2,2,1,5832,298,8
-6820,mul_713,call_function,mul.Tensor,backward,2,2,2,1,5835,296,8
-6821,sub_77,call_function,sub.Tensor,backward,2,2,2,1,5836,295,10
-6822,mul_714,call_function,mul.Tensor,backward,2,2,2,1,5837,294,8
-6827,add_319,call_function,add.Tensor,unknown,,2,2,1,5839,292,10
-6833,einsum_default_556,call_function,einsum.default,backward,2,2,2,1,5842,286,5
-6852,mul_716,call_function,mul.Tensor,backward,2,2,2,1,5856,252,8
-6857,mul_717,call_function,mul.Tensor,backward,2,2,2,1,5856,251,8
-6870,einsum_default_558,call_function,einsum.default,backward,2,2,2,1,5854,242,5
-6877,einsum_default_560,call_function,einsum.default,backward,2,2,2,1,5863,242,5
-6878,add_320,call_function,add.Tensor,unknown,,2,2,1,5870,241,10
-6885,einsum_default_562,call_function,einsum.default,backward,2,2,2,1,5863,241,5
-6886,add_321,call_function,add.Tensor,unknown,,2,2,1,5886,240,10
-6895,mul_719,call_function,mul.Tensor,backward,2,2,2,1,237,238,8
-6894,mul_718,call_function,mul.Tensor,backward,2,2,2,1,5890,232,8
-6898,mul_720,call_function,mul.Tensor,backward,2,2,2,1,5895,230,8
-6901,mul_721,call_function,mul.Tensor,backward,2,2,2,1,5898,228,8
-6902,sub_78,call_function,sub.Tensor,backward,2,2,2,1,5899,227,10
-6903,mul_722,call_function,mul.Tensor,backward,2,2,2,1,5900,226,8
-6908,add_322,call_function,add.Tensor,unknown,,2,2,1,5902,224,10
-6914,einsum_default_564,call_function,einsum.default,backward,1,2,2,1,5905,218,5
-6920,mul_725,call_function,mul.Tensor,backward,1,2,2,1,5907,209,8
-6939,mul_728,call_function,mul.Tensor,backward,1,2,2,1,215,208,8
-6937,mul_727,call_function,mul.Tensor,backward,1,2,2,1,5917,207,8
-6941,mul_729,call_function,mul.Tensor,backward,1,2,2,1,5921,206,8
-6919,mul_724,call_function,mul.Tensor,backward,1,2,2,1,5907,205,8
-6924,einsum_default_566,call_function,einsum.default,backward,1,2,2,1,5910,199,5
-6946,einsum_default_568,call_function,einsum.default,backward,1,2,2,1,5925,199,5
-6947,add_325,call_function,add.Tensor,unknown,,2,2,1,5930,198,10
-6956,mul_731,call_function,mul.Tensor,backward,1,2,2,1,194,196,8
-6955,mul_730,call_function,mul.Tensor,backward,1,2,2,1,5934,190,8
-6959,mul_732,call_function,mul.Tensor,backward,1,2,2,1,5939,188,8
-6962,mul_733,call_function,mul.Tensor,backward,1,2,2,1,5942,186,8
-6963,sub_80,call_function,sub.Tensor,backward,1,2,2,1,5943,185,10
-6964,mul_734,call_function,mul.Tensor,backward,1,2,2,1,5944,184,8
-6969,add_326,call_function,add.Tensor,unknown,,2,2,1,5946,182,10
-6975,einsum_default_570,call_function,einsum.default,backward,1,2,2,1,5949,176,5
-6994,mul_736,call_function,mul.Tensor,backward,1,2,2,1,5963,142,8
-6999,mul_737,call_function,mul.Tensor,backward,1,2,2,1,5963,141,8
-7012,einsum_default_572,call_function,einsum.default,backward,1,2,2,1,5961,132,5
-7019,einsum_default_574,call_function,einsum.default,backward,1,2,2,1,5970,132,5
-7020,add_327,call_function,add.Tensor,unknown,,2,2,1,5977,131,10
-7027,einsum_default_576,call_function,einsum.default,backward,1,2,2,1,5970,131,5
-7028,add_328,call_function,add.Tensor,unknown,,2,2,1,5993,130,10
-7037,mul_739,call_function,mul.Tensor,backward,1,2,2,1,127,128,8
-7036,mul_738,call_function,mul.Tensor,backward,1,2,2,1,5997,122,8
-7040,mul_740,call_function,mul.Tensor,backward,1,2,2,1,6002,120,8
-7043,mul_741,call_function,mul.Tensor,backward,1,2,2,1,6005,118,8
-7044,sub_81,call_function,sub.Tensor,backward,1,2,2,1,6006,117,10
-7045,mul_742,call_function,mul.Tensor,backward,1,2,2,1,6007,116,8
-7050,add_329,call_function,add.Tensor,unknown,,2,2,1,6009,114,10
-7056,einsum_default_578,call_function,einsum.default,backward,0,2,2,1,6012,108,5
-7062,mul_745,call_function,mul.Tensor,backward,0,2,2,1,6014,99,8
-7081,mul_748,call_function,mul.Tensor,backward,0,2,2,1,105,98,8
-7079,mul_747,call_function,mul.Tensor,backward,0,2,2,1,6024,97,8
-7083,mul_749,call_function,mul.Tensor,backward,0,2,2,1,6028,96,8
-7061,mul_744,call_function,mul.Tensor,backward,0,2,2,1,6014,95,8
-7066,einsum_default_580,call_function,einsum.default,backward,0,2,2,1,6017,89,5
-7088,einsum_default_582,call_function,einsum.default,backward,0,2,2,1,6032,89,5
-7089,add_332,call_function,add.Tensor,unknown,,2,2,1,6037,88,10
-7098,mul_751,call_function,mul.Tensor,backward,0,2,2,1,84,86,8
-7097,mul_750,call_function,mul.Tensor,backward,0,2,2,1,6041,80,8
-7101,mul_752,call_function,mul.Tensor,backward,0,2,2,1,6046,78,8
-7104,mul_753,call_function,mul.Tensor,backward,0,2,2,1,6049,76,8
-7105,sub_83,call_function,sub.Tensor,backward,0,2,2,1,6050,75,10
-7106,mul_754,call_function,mul.Tensor,backward,0,2,2,1,6051,74,8
-7111,add_333,call_function,add.Tensor,unknown,,2,2,1,6053,72,10
-7117,einsum_default_584,call_function,einsum.default,backward,0,2,2,1,6056,66,5
-7136,mul_756,call_function,mul.Tensor,backward,0,2,2,1,6070,32,8
-7141,mul_757,call_function,mul.Tensor,backward,0,2,2,1,6070,31,8
-7154,einsum_default_586,call_function,einsum.default,backward,0,2,2,1,6068,22,5
-7161,einsum_default_588,call_function,einsum.default,backward,0,2,2,1,6077,22,5
-7162,add_334,call_function,add.Tensor,unknown,,2,2,1,6084,21,10
-7169,einsum_default_590,call_function,einsum.default,backward,0,2,2,1,6077,21,5
-7170,add_335,call_function,add.Tensor,unknown,,2,2,1,6100,20,10
-7179,mul_759,call_function,mul.Tensor,backward,0,2,2,1,15,18,8
-7178,mul_758,call_function,mul.Tensor,backward,0,2,2,1,6104,12,8
-3183,mul_196,call_function,mul.Tensor,forward,,2,2,1,3096,10,8
-7182,mul_760,call_function,mul.Tensor,backward,0,2,2,1,6109,10,8
-3185,mul_197,call_function,mul.Tensor,forward,,2,2,1,3100,9,8
-7185,mul_761,call_function,mul.Tensor,backward,0,2,2,1,6112,8,8
-7186,sub_84,call_function,sub.Tensor,backward,0,2,2,1,6113,7,10
-7187,mul_762,call_function,mul.Tensor,backward,0,2,2,1,6114,6,8
-3194,einsum_default_197,call_function,einsum.default,backward,,2,2,1,3105,4,5
-3213,mul_203,call_function,mul.Tensor,backward,,2,2,1,3108,4,8
-3273,mul_215,call_function,mul.Tensor,backward,27,2,2,1,3154,4,8
-3354,mul_223,call_function,mul.Tensor,backward,27,2,2,1,3217,4,8
-3415,mul_235,call_function,mul.Tensor,backward,26,2,2,1,3261,4,8
-3496,mul_243,call_function,mul.Tensor,backward,26,2,2,1,3324,4,8
-3557,mul_255,call_function,mul.Tensor,backward,25,2,2,1,3368,4,8
-3638,mul_263,call_function,mul.Tensor,backward,25,2,2,1,3431,4,8
-3699,mul_275,call_function,mul.Tensor,backward,24,2,2,1,3475,4,8
-3780,mul_283,call_function,mul.Tensor,backward,24,2,2,1,3538,4,8
-3841,mul_295,call_function,mul.Tensor,backward,23,2,2,1,3582,4,8
-3922,mul_303,call_function,mul.Tensor,backward,23,2,2,1,3645,4,8
-3983,mul_315,call_function,mul.Tensor,backward,22,2,2,1,3689,4,8
-4064,mul_323,call_function,mul.Tensor,backward,22,2,2,1,3752,4,8
-4125,mul_335,call_function,mul.Tensor,backward,21,2,2,1,3796,4,8
-4206,mul_343,call_function,mul.Tensor,backward,21,2,2,1,3859,4,8
-4267,mul_355,call_function,mul.Tensor,backward,20,2,2,1,3903,4,8
-4348,mul_363,call_function,mul.Tensor,backward,20,2,2,1,3966,4,8
-4409,mul_375,call_function,mul.Tensor,backward,19,2,2,1,4010,4,8
-4490,mul_383,call_function,mul.Tensor,backward,19,2,2,1,4073,4,8
-4551,mul_395,call_function,mul.Tensor,backward,18,2,2,1,4117,4,8
-4632,mul_403,call_function,mul.Tensor,backward,18,2,2,1,4180,4,8
-4693,mul_415,call_function,mul.Tensor,backward,17,2,2,1,4224,4,8
-4774,mul_423,call_function,mul.Tensor,backward,17,2,2,1,4287,4,8
-4835,mul_435,call_function,mul.Tensor,backward,16,2,2,1,4331,4,8
-4916,mul_443,call_function,mul.Tensor,backward,16,2,2,1,4394,4,8
-4977,mul_455,call_function,mul.Tensor,backward,15,2,2,1,4438,4,8
-5058,mul_463,call_function,mul.Tensor,backward,15,2,2,1,4501,4,8
-5119,mul_475,call_function,mul.Tensor,backward,14,2,2,1,4545,4,8
-5200,mul_483,call_function,mul.Tensor,backward,14,2,2,1,4608,4,8
-5261,mul_495,call_function,mul.Tensor,backward,13,2,2,1,4652,4,8
-5342,mul_503,call_function,mul.Tensor,backward,13,2,2,1,4715,4,8
-5403,mul_515,call_function,mul.Tensor,backward,12,2,2,1,4759,4,8
-5484,mul_523,call_function,mul.Tensor,backward,12,2,2,1,4822,4,8
-5545,mul_535,call_function,mul.Tensor,backward,11,2,2,1,4866,4,8
-5626,mul_543,call_function,mul.Tensor,backward,11,2,2,1,4929,4,8
-5687,mul_555,call_function,mul.Tensor,backward,10,2,2,1,4973,4,8
-5768,mul_563,call_function,mul.Tensor,backward,10,2,2,1,5036,4,8
-5829,mul_575,call_function,mul.Tensor,backward,9,2,2,1,5080,4,8
-5910,mul_583,call_function,mul.Tensor,backward,9,2,2,1,5143,4,8
-5971,mul_595,call_function,mul.Tensor,backward,8,2,2,1,5187,4,8
-6052,mul_603,call_function,mul.Tensor,backward,8,2,2,1,5250,4,8
-6113,mul_615,call_function,mul.Tensor,backward,7,2,2,1,5294,4,8
-6194,mul_623,call_function,mul.Tensor,backward,7,2,2,1,5357,4,8
-6255,mul_635,call_function,mul.Tensor,backward,6,2,2,1,5401,4,8
-6336,mul_643,call_function,mul.Tensor,backward,6,2,2,1,5464,4,8
-6397,mul_655,call_function,mul.Tensor,backward,5,2,2,1,5508,4,8
-6478,mul_663,call_function,mul.Tensor,backward,5,2,2,1,5571,4,8
-6539,mul_675,call_function,mul.Tensor,backward,4,2,2,1,5615,4,8
-6620,mul_683,call_function,mul.Tensor,backward,4,2,2,1,5678,4,8
-6681,mul_695,call_function,mul.Tensor,backward,3,2,2,1,5722,4,8
-6762,mul_703,call_function,mul.Tensor,backward,3,2,2,1,5785,4,8
-6823,mul_715,call_function,mul.Tensor,backward,2,2,2,1,5829,4,8
-6904,mul_723,call_function,mul.Tensor,backward,2,2,2,1,5892,4,8
-6965,mul_735,call_function,mul.Tensor,backward,1,2,2,1,5936,4,8
-7046,mul_743,call_function,mul.Tensor,backward,1,2,2,1,5999,4,8
-7107,mul_755,call_function,mul.Tensor,backward,0,2,2,1,6043,4,8
-7188,mul_763,call_function,mul.Tensor,backward,0,2,2,1,6106,4,8
-7192,add_336,call_function,add.Tensor,unknown,,2,2,1,6116,4,10
-3220,einsum_default_199,call_function,einsum.default,backward,27,2,2,1,3122,3,5
-3230,einsum_default_201,call_function,einsum.default,backward,27,2,2,1,3127,3,5
-3252,einsum_default_203,call_function,einsum.default,backward,27,2,2,1,3142,3,5
-3281,einsum_default_205,call_function,einsum.default,backward,27,2,2,1,3166,3,5
-3318,einsum_default_207,call_function,einsum.default,backward,27,2,2,1,3178,3,5
-3325,einsum_default_209,call_function,einsum.default,backward,27,2,2,1,3187,3,5
-3333,einsum_default_211,call_function,einsum.default,backward,27,2,2,1,3187,3,5
-3362,einsum_default_213,call_function,einsum.default,backward,26,2,2,1,3229,3,5
-3372,einsum_default_215,call_function,einsum.default,backward,26,2,2,1,3234,3,5
-3394,einsum_default_217,call_function,einsum.default,backward,26,2,2,1,3249,3,5
-3423,einsum_default_219,call_function,einsum.default,backward,26,2,2,1,3273,3,5
-3460,einsum_default_221,call_function,einsum.default,backward,26,2,2,1,3285,3,5
-3467,einsum_default_223,call_function,einsum.default,backward,26,2,2,1,3294,3,5
-3475,einsum_default_225,call_function,einsum.default,backward,26,2,2,1,3294,3,5
-3504,einsum_default_227,call_function,einsum.default,backward,25,2,2,1,3336,3,5
-3514,einsum_default_229,call_function,einsum.default,backward,25,2,2,1,3341,3,5
-3536,einsum_default_231,call_function,einsum.default,backward,25,2,2,1,3356,3,5
-3565,einsum_default_233,call_function,einsum.default,backward,25,2,2,1,3380,3,5
-3602,einsum_default_235,call_function,einsum.default,backward,25,2,2,1,3392,3,5
-3609,einsum_default_237,call_function,einsum.default,backward,25,2,2,1,3401,3,5
-3617,einsum_default_239,call_function,einsum.default,backward,25,2,2,1,3401,3,5
-3646,einsum_default_241,call_function,einsum.default,backward,24,2,2,1,3443,3,5
-3656,einsum_default_243,call_function,einsum.default,backward,24,2,2,1,3448,3,5
-3678,einsum_default_245,call_function,einsum.default,backward,24,2,2,1,3463,3,5
-3707,einsum_default_247,call_function,einsum.default,backward,24,2,2,1,3487,3,5
-3744,einsum_default_249,call_function,einsum.default,backward,24,2,2,1,3499,3,5
-3751,einsum_default_251,call_function,einsum.default,backward,24,2,2,1,3508,3,5
-3759,einsum_default_253,call_function,einsum.default,backward,24,2,2,1,3508,3,5
-3788,einsum_default_255,call_function,einsum.default,backward,23,2,2,1,3550,3,5
-3798,einsum_default_257,call_function,einsum.default,backward,23,2,2,1,3555,3,5
-3820,einsum_default_259,call_function,einsum.default,backward,23,2,2,1,3570,3,5
-3849,einsum_default_261,call_function,einsum.default,backward,23,2,2,1,3594,3,5
-3886,einsum_default_263,call_function,einsum.default,backward,23,2,2,1,3606,3,5
-3893,einsum_default_265,call_function,einsum.default,backward,23,2,2,1,3615,3,5
-3901,einsum_default_267,call_function,einsum.default,backward,23,2,2,1,3615,3,5
-3930,einsum_default_269,call_function,einsum.default,backward,22,2,2,1,3657,3,5
-3940,einsum_default_271,call_function,einsum.default,backward,22,2,2,1,3662,3,5
-3962,einsum_default_273,call_function,einsum.default,backward,22,2,2,1,3677,3,5
-3991,einsum_default_275,call_function,einsum.default,backward,22,2,2,1,3701,3,5
-4028,einsum_default_277,call_function,einsum.default,backward,22,2,2,1,3713,3,5
-4035,einsum_default_279,call_function,einsum.default,backward,22,2,2,1,3722,3,5
-4043,einsum_default_281,call_function,einsum.default,backward,22,2,2,1,3722,3,5
-4072,einsum_default_283,call_function,einsum.default,backward,21,2,2,1,3764,3,5
-4082,einsum_default_285,call_function,einsum.default,backward,21,2,2,1,3769,3,5
-4104,einsum_default_287,call_function,einsum.default,backward,21,2,2,1,3784,3,5
-4133,einsum_default_289,call_function,einsum.default,backward,21,2,2,1,3808,3,5
-4170,einsum_default_291,call_function,einsum.default,backward,21,2,2,1,3820,3,5
-4177,einsum_default_293,call_function,einsum.default,backward,21,2,2,1,3829,3,5
-4185,einsum_default_295,call_function,einsum.default,backward,21,2,2,1,3829,3,5
-4214,einsum_default_297,call_function,einsum.default,backward,20,2,2,1,3871,3,5
-4224,einsum_default_299,call_function,einsum.default,backward,20,2,2,1,3876,3,5
-4246,einsum_default_301,call_function,einsum.default,backward,20,2,2,1,3891,3,5
-4275,einsum_default_303,call_function,einsum.default,backward,20,2,2,1,3915,3,5
-4312,einsum_default_305,call_function,einsum.default,backward,20,2,2,1,3927,3,5
-4319,einsum_default_307,call_function,einsum.default,backward,20,2,2,1,3936,3,5
-4327,einsum_default_309,call_function,einsum.default,backward,20,2,2,1,3936,3,5
-4356,einsum_default_311,call_function,einsum.default,backward,19,2,2,1,3978,3,5
-4366,einsum_default_313,call_function,einsum.default,backward,19,2,2,1,3983,3,5
-4388,einsum_default_315,call_function,einsum.default,backward,19,2,2,1,3998,3,5
-4417,einsum_default_317,call_function,einsum.default,backward,19,2,2,1,4022,3,5
-4454,einsum_default_319,call_function,einsum.default,backward,19,2,2,1,4034,3,5
-4461,einsum_default_321,call_function,einsum.default,backward,19,2,2,1,4043,3,5
-4469,einsum_default_323,call_function,einsum.default,backward,19,2,2,1,4043,3,5
-4498,einsum_default_325,call_function,einsum.default,backward,18,2,2,1,4085,3,5
-4508,einsum_default_327,call_function,einsum.default,backward,18,2,2,1,4090,3,5
-4530,einsum_default_329,call_function,einsum.default,backward,18,2,2,1,4105,3,5
-4559,einsum_default_331,call_function,einsum.default,backward,18,2,2,1,4129,3,5
-4596,einsum_default_333,call_function,einsum.default,backward,18,2,2,1,4141,3,5
-4603,einsum_default_335,call_function,einsum.default,backward,18,2,2,1,4150,3,5
-4611,einsum_default_337,call_function,einsum.default,backward,18,2,2,1,4150,3,5
-4640,einsum_default_339,call_function,einsum.default,backward,17,2,2,1,4192,3,5
-4650,einsum_default_341,call_function,einsum.default,backward,17,2,2,1,4197,3,5
-4672,einsum_default_343,call_function,einsum.default,backward,17,2,2,1,4212,3,5
-4701,einsum_default_345,call_function,einsum.default,backward,17,2,2,1,4236,3,5
-4738,einsum_default_347,call_function,einsum.default,backward,17,2,2,1,4248,3,5
-4745,einsum_default_349,call_function,einsum.default,backward,17,2,2,1,4257,3,5
-4753,einsum_default_351,call_function,einsum.default,backward,17,2,2,1,4257,3,5
-4782,einsum_default_353,call_function,einsum.default,backward,16,2,2,1,4299,3,5
-4792,einsum_default_355,call_function,einsum.default,backward,16,2,2,1,4304,3,5
-4814,einsum_default_357,call_function,einsum.default,backward,16,2,2,1,4319,3,5
-4843,einsum_default_359,call_function,einsum.default,backward,16,2,2,1,4343,3,5
-4880,einsum_default_361,call_function,einsum.default,backward,16,2,2,1,4355,3,5
-4887,einsum_default_363,call_function,einsum.default,backward,16,2,2,1,4364,3,5
-4895,einsum_default_365,call_function,einsum.default,backward,16,2,2,1,4364,3,5
-4924,einsum_default_367,call_function,einsum.default,backward,15,2,2,1,4406,3,5
-4934,einsum_default_369,call_function,einsum.default,backward,15,2,2,1,4411,3,5
-4956,einsum_default_371,call_function,einsum.default,backward,15,2,2,1,4426,3,5
-4985,einsum_default_373,call_function,einsum.default,backward,15,2,2,1,4450,3,5
-5022,einsum_default_375,call_function,einsum.default,backward,15,2,2,1,4462,3,5
-5029,einsum_default_377,call_function,einsum.default,backward,15,2,2,1,4471,3,5
-5037,einsum_default_379,call_function,einsum.default,backward,15,2,2,1,4471,3,5
-5066,einsum_default_381,call_function,einsum.default,backward,14,2,2,1,4513,3,5
-5076,einsum_default_383,call_function,einsum.default,backward,14,2,2,1,4518,3,5
-5098,einsum_default_385,call_function,einsum.default,backward,14,2,2,1,4533,3,5
-5127,einsum_default_387,call_function,einsum.default,backward,14,2,2,1,4557,3,5
-5164,einsum_default_389,call_function,einsum.default,backward,14,2,2,1,4569,3,5
-5171,einsum_default_391,call_function,einsum.default,backward,14,2,2,1,4578,3,5
-5179,einsum_default_393,call_function,einsum.default,backward,14,2,2,1,4578,3,5
-5208,einsum_default_395,call_function,einsum.default,backward,13,2,2,1,4620,3,5
-5218,einsum_default_397,call_function,einsum.default,backward,13,2,2,1,4625,3,5
-5240,einsum_default_399,call_function,einsum.default,backward,13,2,2,1,4640,3,5
-5269,einsum_default_401,call_function,einsum.default,backward,13,2,2,1,4664,3,5
-5306,einsum_default_403,call_function,einsum.default,backward,13,2,2,1,4676,3,5
-5313,einsum_default_405,call_function,einsum.default,backward,13,2,2,1,4685,3,5
-5321,einsum_default_407,call_function,einsum.default,backward,13,2,2,1,4685,3,5
-5350,einsum_default_409,call_function,einsum.default,backward,12,2,2,1,4727,3,5
-5360,einsum_default_411,call_function,einsum.default,backward,12,2,2,1,4732,3,5
-5382,einsum_default_413,call_function,einsum.default,backward,12,2,2,1,4747,3,5
-5411,einsum_default_415,call_function,einsum.default,backward,12,2,2,1,4771,3,5
-5448,einsum_default_417,call_function,einsum.default,backward,12,2,2,1,4783,3,5
-5455,einsum_default_419,call_function,einsum.default,backward,12,2,2,1,4792,3,5
-5463,einsum_default_421,call_function,einsum.default,backward,12,2,2,1,4792,3,5
-5492,einsum_default_423,call_function,einsum.default,backward,11,2,2,1,4834,3,5
-5502,einsum_default_425,call_function,einsum.default,backward,11,2,2,1,4839,3,5
-5524,einsum_default_427,call_function,einsum.default,backward,11,2,2,1,4854,3,5
-5553,einsum_default_429,call_function,einsum.default,backward,11,2,2,1,4878,3,5
-5590,einsum_default_431,call_function,einsum.default,backward,11,2,2,1,4890,3,5
-5597,einsum_default_433,call_function,einsum.default,backward,11,2,2,1,4899,3,5
-5605,einsum_default_435,call_function,einsum.default,backward,11,2,2,1,4899,3,5
-5634,einsum_default_437,call_function,einsum.default,backward,10,2,2,1,4941,3,5
-5644,einsum_default_439,call_function,einsum.default,backward,10,2,2,1,4946,3,5
-5666,einsum_default_441,call_function,einsum.default,backward,10,2,2,1,4961,3,5
-5695,einsum_default_443,call_function,einsum.default,backward,10,2,2,1,4985,3,5
-5732,einsum_default_445,call_function,einsum.default,backward,10,2,2,1,4997,3,5
-5739,einsum_default_447,call_function,einsum.default,backward,10,2,2,1,5006,3,5
-5747,einsum_default_449,call_function,einsum.default,backward,10,2,2,1,5006,3,5
-5776,einsum_default_451,call_function,einsum.default,backward,9,2,2,1,5048,3,5
-5786,einsum_default_453,call_function,einsum.default,backward,9,2,2,1,5053,3,5
-5808,einsum_default_455,call_function,einsum.default,backward,9,2,2,1,5068,3,5
-5837,einsum_default_457,call_function,einsum.default,backward,9,2,2,1,5092,3,5
-5874,einsum_default_459,call_function,einsum.default,backward,9,2,2,1,5104,3,5
-5881,einsum_default_461,call_function,einsum.default,backward,9,2,2,1,5113,3,5
-5889,einsum_default_463,call_function,einsum.default,backward,9,2,2,1,5113,3,5
-5918,einsum_default_465,call_function,einsum.default,backward,8,2,2,1,5155,3,5
-5928,einsum_default_467,call_function,einsum.default,backward,8,2,2,1,5160,3,5
-5950,einsum_default_469,call_function,einsum.default,backward,8,2,2,1,5175,3,5
-5979,einsum_default_471,call_function,einsum.default,backward,8,2,2,1,5199,3,5
-6016,einsum_default_473,call_function,einsum.default,backward,8,2,2,1,5211,3,5
-6023,einsum_default_475,call_function,einsum.default,backward,8,2,2,1,5220,3,5
-6031,einsum_default_477,call_function,einsum.default,backward,8,2,2,1,5220,3,5
-6060,einsum_default_479,call_function,einsum.default,backward,7,2,2,1,5262,3,5
-6070,einsum_default_481,call_function,einsum.default,backward,7,2,2,1,5267,3,5
-6092,einsum_default_483,call_function,einsum.default,backward,7,2,2,1,5282,3,5
-6121,einsum_default_485,call_function,einsum.default,backward,7,2,2,1,5306,3,5
-6158,einsum_default_487,call_function,einsum.default,backward,7,2,2,1,5318,3,5
-6165,einsum_default_489,call_function,einsum.default,backward,7,2,2,1,5327,3,5
-6173,einsum_default_491,call_function,einsum.default,backward,7,2,2,1,5327,3,5
-6202,einsum_default_493,call_function,einsum.default,backward,6,2,2,1,5369,3,5
-6212,einsum_default_495,call_function,einsum.default,backward,6,2,2,1,5374,3,5
-6234,einsum_default_497,call_function,einsum.default,backward,6,2,2,1,5389,3,5
-6263,einsum_default_499,call_function,einsum.default,backward,6,2,2,1,5413,3,5
-6300,einsum_default_501,call_function,einsum.default,backward,6,2,2,1,5425,3,5
-6307,einsum_default_503,call_function,einsum.default,backward,6,2,2,1,5434,3,5
-6315,einsum_default_505,call_function,einsum.default,backward,6,2,2,1,5434,3,5
-6344,einsum_default_507,call_function,einsum.default,backward,5,2,2,1,5476,3,5
-6354,einsum_default_509,call_function,einsum.default,backward,5,2,2,1,5481,3,5
-6376,einsum_default_511,call_function,einsum.default,backward,5,2,2,1,5496,3,5
-6405,einsum_default_513,call_function,einsum.default,backward,5,2,2,1,5520,3,5
-6442,einsum_default_515,call_function,einsum.default,backward,5,2,2,1,5532,3,5
-6449,einsum_default_517,call_function,einsum.default,backward,5,2,2,1,5541,3,5
-6457,einsum_default_519,call_function,einsum.default,backward,5,2,2,1,5541,3,5
-6486,einsum_default_521,call_function,einsum.default,backward,4,2,2,1,5583,3,5
-6496,einsum_default_523,call_function,einsum.default,backward,4,2,2,1,5588,3,5
-6518,einsum_default_525,call_function,einsum.default,backward,4,2,2,1,5603,3,5
-6547,einsum_default_527,call_function,einsum.default,backward,4,2,2,1,5627,3,5
-6584,einsum_default_529,call_function,einsum.default,backward,4,2,2,1,5639,3,5
-6591,einsum_default_531,call_function,einsum.default,backward,4,2,2,1,5648,3,5
-6599,einsum_default_533,call_function,einsum.default,backward,4,2,2,1,5648,3,5
-6628,einsum_default_535,call_function,einsum.default,backward,3,2,2,1,5690,3,5
-6638,einsum_default_537,call_function,einsum.default,backward,3,2,2,1,5695,3,5
-6660,einsum_default_539,call_function,einsum.default,backward,3,2,2,1,5710,3,5
-6689,einsum_default_541,call_function,einsum.default,backward,3,2,2,1,5734,3,5
-6726,einsum_default_543,call_function,einsum.default,backward,3,2,2,1,5746,3,5
-6733,einsum_default_545,call_function,einsum.default,backward,3,2,2,1,5755,3,5
-6741,einsum_default_547,call_function,einsum.default,backward,3,2,2,1,5755,3,5
-6770,einsum_default_549,call_function,einsum.default,backward,2,2,2,1,5797,3,5
-6780,einsum_default_551,call_function,einsum.default,backward,2,2,2,1,5802,3,5
-6802,einsum_default_553,call_function,einsum.default,backward,2,2,2,1,5817,3,5
-6831,einsum_default_555,call_function,einsum.default,backward,2,2,2,1,5841,3,5
-6868,einsum_default_557,call_function,einsum.default,backward,2,2,2,1,5853,3,5
-6875,einsum_default_559,call_function,einsum.default,backward,2,2,2,1,5862,3,5
-6883,einsum_default_561,call_function,einsum.default,backward,2,2,2,1,5862,3,5
-6912,einsum_default_563,call_function,einsum.default,backward,1,2,2,1,5904,3,5
-6922,einsum_default_565,call_function,einsum.default,backward,1,2,2,1,5909,3,5
-6944,einsum_default_567,call_function,einsum.default,backward,1,2,2,1,5924,3,5
-6973,einsum_default_569,call_function,einsum.default,backward,1,2,2,1,5948,3,5
-7010,einsum_default_571,call_function,einsum.default,backward,1,2,2,1,5960,3,5
-7017,einsum_default_573,call_function,einsum.default,backward,1,2,2,1,5969,3,5
-7025,einsum_default_575,call_function,einsum.default,backward,1,2,2,1,5969,3,5
-7054,einsum_default_577,call_function,einsum.default,backward,0,2,2,1,6011,3,5
-7064,einsum_default_579,call_function,einsum.default,backward,0,2,2,1,6016,3,5
-7086,einsum_default_581,call_function,einsum.default,backward,0,2,2,1,6031,3,5
-7115,einsum_default_583,call_function,einsum.default,backward,0,2,2,1,6055,3,5
-7152,einsum_default_585,call_function,einsum.default,backward,0,2,2,1,6067,3,5
-7159,einsum_default_587,call_function,einsum.default,backward,0,2,2,1,6076,3,5
-7167,einsum_default_589,call_function,einsum.default,backward,0,2,2,1,6076,3,5
-7195,embedding_dense_backward,call_function,embedding_dense_backward.default,backward,,2,2,1,6117,3,5
-3191,einsum_default_196,call_function,einsum.default,forward,,2,2,1,3106,1,5
-7197,add_337,call_function,add.Tensor,unknown,,2,2,1,6126,1,9
diff --git a/profile_results/real_llama3_by_mesh_dim.svg b/profile_results/real_llama3_by_mesh_dim.svg
deleted file mode 100644
index 6eb41508..00000000
--- a/profile_results/real_llama3_by_mesh_dim.svg
+++ /dev/null
@@ -1,167 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" width="1600" height="1000" viewBox="0 0 1600 1000">
-<rect width="100%" height="100%" fill="white"/>
-<text x="32" y="34" font-family="sans-serif" font-size="22" font-weight="700">Real Llama3 optimizer profile vs mesh dimension</text>
-<text x="32" y="58" font-family="sans-serif" font-size="12" fill="#475569">Y axes are log scale. Missing series points timed out or were not run.</text>
-<line x1="32" y1="84" x2="60" y2="84" stroke="#2563eb" stroke-width="3"/>
-<text x="68" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=1B</text>
-<line x1="212" y1="84" x2="240" y2="84" stroke="#dc2626" stroke-width="3"/>
-<text x="248" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=3B</text>
-<line x1="392" y1="84" x2="420" y2="84" stroke="#16a34a" stroke-width="3"/>
-<text x="428" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=405B</text>
-<line x1="572" y1="84" x2="600" y2="84" stroke="#9333ea" stroke-width="3"/>
-<text x="608" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=70B</text>
-<line x1="752" y1="84" x2="780" y2="84" stroke="#ea580c" stroke-width="3"/>
-<text x="788" y="88" font-family="sans-serif" font-size="12" fill="#334155">model_key=8B</text>
-<text x="62" y="106" font-family="sans-serif" font-size="14" font-weight="700">strategy enum (s)</text>
-<rect x="62" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="62" y1="300" x2="422" y2="300" stroke="#64748b"/>
-<line x1="62" y1="120" x2="62" y2="300" stroke="#64748b"/>
-<text x="12" y="132" font-family="sans-serif" font-size="10" fill="#64748b">9.9</text>
-<text x="12" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
-<text x="46.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="406.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="62.0,286.0 422.0,130.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="62.0" cy="286.0" r="3.5" fill="#2563eb"/>
-<circle cx="422.0" cy="130.3" r="3.5" fill="#2563eb"/>
-<polyline points="62.0,300.0 422.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="62.0" cy="300.0" r="3.5" fill="#dc2626"/>
-<circle cx="422.0" cy="120.0" r="3.5" fill="#dc2626"/>
-<circle cx="62.0" cy="203.6" r="3.5" fill="#16a34a"/>
-<circle cx="62.0" cy="187.9" r="3.5" fill="#9333ea"/>
-<polyline points="62.0,265.4 422.0,124.3" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="62.0" cy="265.4" r="3.5" fill="#ea580c"/>
-<circle cx="422.0" cy="124.3" r="3.5" fill="#ea580c"/>
-<text x="452" y="106" font-family="sans-serif" font-size="14" font-weight="700">cost estimation (s)</text>
-<rect x="452" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="452" y1="300" x2="812" y2="300" stroke="#64748b"/>
-<line x1="452" y1="120" x2="452" y2="300" stroke="#64748b"/>
-<text x="402" y="132" font-family="sans-serif" font-size="10" fill="#64748b">5</text>
-<text x="402" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
-<text x="436.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="796.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="452.0,300.0 812.0,124.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="452.0" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="812.0" cy="124.3" r="3.5" fill="#2563eb"/>
-<polyline points="452.0,298.4 812.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="452.0" cy="298.4" r="3.5" fill="#dc2626"/>
-<circle cx="812.0" cy="120.0" r="3.5" fill="#dc2626"/>
-<circle cx="452.0" cy="299.1" r="3.5" fill="#16a34a"/>
-<circle cx="452.0" cy="281.4" r="3.5" fill="#9333ea"/>
-<polyline points="452.0,297.8 812.0,124.5" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="452.0" cy="297.8" r="3.5" fill="#ea580c"/>
-<circle cx="812.0" cy="124.5" r="3.5" fill="#ea580c"/>
-<text x="842" y="106" font-family="sans-serif" font-size="14" font-weight="700">ILP construction (s)</text>
-<rect x="842" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="842" y1="300" x2="1202" y2="300" stroke="#64748b"/>
-<line x1="842" y1="120" x2="842" y2="300" stroke="#64748b"/>
-<text x="792" y="132" font-family="sans-serif" font-size="10" fill="#64748b">14</text>
-<text x="792" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.26</text>
-<text x="826.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="1186.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="842.0,300.0 1202.0,134.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="842.0" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="1202.0" cy="134.3" r="3.5" fill="#2563eb"/>
-<polyline points="842.0,292.0 1202.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="842.0" cy="292.0" r="3.5" fill="#dc2626"/>
-<circle cx="1202.0" cy="120.0" r="3.5" fill="#dc2626"/>
-<circle cx="842.0" cy="208.5" r="3.5" fill="#16a34a"/>
-<circle cx="842.0" cy="203.9" r="3.5" fill="#9333ea"/>
-<polyline points="842.0,254.2 1202.0,129.6" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="842.0" cy="254.2" r="3.5" fill="#ea580c"/>
-<circle cx="1202.0" cy="129.6" r="3.5" fill="#ea580c"/>
-<text x="1232" y="106" font-family="sans-serif" font-size="14" font-weight="700">objective build (s)</text>
-<rect x="1232" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="1232" y1="300" x2="1592" y2="300" stroke="#64748b"/>
-<line x1="1232" y1="120" x2="1232" y2="300" stroke="#64748b"/>
-<text x="1182" y="132" font-family="sans-serif" font-size="10" fill="#64748b">3.3</text>
-<text x="1182" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.053</text>
-<text x="1216.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="1576.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="1232.0,300.0 1592.0,138.1" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="1232.0" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="1592.0" cy="138.1" r="3.5" fill="#2563eb"/>
-<polyline points="1232.0,295.0 1592.0,133.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="1232.0" cy="295.0" r="3.5" fill="#dc2626"/>
-<circle cx="1592.0" cy="133.0" r="3.5" fill="#dc2626"/>
-<circle cx="1232.0" cy="288.4" r="3.5" fill="#16a34a"/>
-<circle cx="1232.0" cy="286.3" r="3.5" fill="#9333ea"/>
-<polyline points="1232.0,295.7 1592.0,120.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="1232.0" cy="295.7" r="3.5" fill="#ea580c"/>
-<circle cx="1592.0" cy="120.0" r="3.5" fill="#ea580c"/>
-<text x="62" y="336" font-family="sans-serif" font-size="14" font-weight="700">solve (s)</text>
-<rect x="62" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="62" y1="530" x2="422" y2="530" stroke="#64748b"/>
-<line x1="62" y1="350" x2="62" y2="530" stroke="#64748b"/>
-<text x="12" y="362" font-family="sans-serif" font-size="10" fill="#64748b">86</text>
-<text x="12" y="530" font-family="sans-serif" font-size="10" fill="#64748b">0.49</text>
-<text x="46.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="406.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="62.0,530.0 422.0,352.4" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="62.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="422.0" cy="352.4" r="3.5" fill="#2563eb"/>
-<polyline points="62.0,523.9 422.0,353.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="62.0" cy="523.9" r="3.5" fill="#dc2626"/>
-<circle cx="422.0" cy="353.0" r="3.5" fill="#dc2626"/>
-<circle cx="62.0" cy="472.4" r="3.5" fill="#16a34a"/>
-<circle cx="62.0" cy="490.5" r="3.5" fill="#9333ea"/>
-<polyline points="62.0,523.9 422.0,350.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="62.0" cy="523.9" r="3.5" fill="#ea580c"/>
-<circle cx="422.0" cy="350.0" r="3.5" fill="#ea580c"/>
-<text x="452" y="336" font-family="sans-serif" font-size="14" font-weight="700">pipeline total (s)</text>
-<rect x="452" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="452" y1="530" x2="812" y2="530" stroke="#64748b"/>
-<line x1="452" y1="350" x2="452" y2="530" stroke="#64748b"/>
-<text x="402" y="362" font-family="sans-serif" font-size="10" fill="#64748b">124</text>
-<text x="402" y="530" font-family="sans-serif" font-size="10" fill="#64748b">3</text>
-<text x="436.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="796.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="452.0,530.0 812.0,356.0" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="452.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="812.0" cy="356.0" r="3.5" fill="#2563eb"/>
-<polyline points="452.0,514.5 812.0,352.1" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="452.0" cy="514.5" r="3.5" fill="#dc2626"/>
-<circle cx="812.0" cy="352.1" r="3.5" fill="#dc2626"/>
-<circle cx="452.0" cy="419.0" r="3.5" fill="#16a34a"/>
-<circle cx="452.0" cy="438.3" r="3.5" fill="#9333ea"/>
-<polyline points="452.0,508.1 812.0,350.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="452.0" cy="508.1" r="3.5" fill="#ea580c"/>
-<circle cx="812.0" cy="350.0" r="3.5" fill="#ea580c"/>
-<text x="842" y="336" font-family="sans-serif" font-size="14" font-weight="700">unique ILP vars</text>
-<rect x="842" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="842" y1="530" x2="1202" y2="530" stroke="#64748b"/>
-<line x1="842" y1="350" x2="842" y2="530" stroke="#64748b"/>
-<text x="792" y="362" font-family="sans-serif" font-size="10" fill="#64748b">488.5K</text>
-<text x="792" y="530" font-family="sans-serif" font-size="10" fill="#64748b">13.0K</text>
-<text x="826.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="1186.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="842.0,530.0 1202.0,350.6" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="842.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="1202.0" cy="350.6" r="3.5" fill="#2563eb"/>
-<polyline points="842.0,526.8 1202.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="842.0" cy="526.8" r="3.5" fill="#dc2626"/>
-<circle cx="1202.0" cy="350.0" r="3.5" fill="#dc2626"/>
-<circle cx="842.0" cy="515.7" r="3.5" fill="#16a34a"/>
-<circle cx="842.0" cy="520.2" r="3.5" fill="#9333ea"/>
-<polyline points="842.0,526.6 1202.0,350.1" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="842.0" cy="526.6" r="3.5" fill="#ea580c"/>
-<circle cx="1202.0" cy="350.1" r="3.5" fill="#ea580c"/>
-<text x="1232" y="336" font-family="sans-serif" font-size="14" font-weight="700">constraints</text>
-<rect x="1232" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="1232" y1="530" x2="1592" y2="530" stroke="#64748b"/>
-<line x1="1232" y1="350" x2="1232" y2="530" stroke="#64748b"/>
-<text x="1182" y="362" font-family="sans-serif" font-size="10" fill="#64748b">177.2K</text>
-<text x="1182" y="530" font-family="sans-serif" font-size="10" fill="#64748b">7.0K</text>
-<text x="1216.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1</text>
-<text x="1576.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">2</text>
-<polyline points="1232.0,530.0 1592.0,351.3" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="1232.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="1592.0" cy="351.3" r="3.5" fill="#2563eb"/>
-<polyline points="1232.0,522.3 1592.0,350.2" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="1232.0" cy="522.3" r="3.5" fill="#dc2626"/>
-<circle cx="1592.0" cy="350.2" r="3.5" fill="#dc2626"/>
-<circle cx="1232.0" cy="486.0" r="3.5" fill="#16a34a"/>
-<circle cx="1232.0" cy="500.0" r="3.5" fill="#9333ea"/>
-<polyline points="1232.0,520.3 1592.0,350.0" fill="none" stroke="#ea580c" stroke-width="2.4"/>
-<circle cx="1232.0" cy="520.3" r="3.5" fill="#ea580c"/>
-<circle cx="1592.0" cy="350.0" r="3.5" fill="#ea580c"/>
-</svg>
\ No newline at end of file
diff --git a/profile_results/real_llama3_by_model_size.svg b/profile_results/real_llama3_by_model_size.svg
deleted file mode 100644
index 11fabae2..00000000
--- a/profile_results/real_llama3_by_model_size.svg
+++ /dev/null
@@ -1,177 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" width="1600" height="1000" viewBox="0 0 1600 1000">
-<rect width="100%" height="100%" fill="white"/>
-<text x="32" y="34" font-family="sans-serif" font-size="22" font-weight="700">Real Llama3 optimizer profile vs model size</text>
-<text x="32" y="58" font-family="sans-serif" font-size="12" fill="#475569">Y axes are log scale. Missing series points timed out or were not run.</text>
-<line x1="32" y1="84" x2="60" y2="84" stroke="#2563eb" stroke-width="3"/>
-<text x="68" y="88" font-family="sans-serif" font-size="12" fill="#334155">mesh_ndim=1</text>
-<line x1="212" y1="84" x2="240" y2="84" stroke="#dc2626" stroke-width="3"/>
-<text x="248" y="88" font-family="sans-serif" font-size="12" fill="#334155">mesh_ndim=2</text>
-<text x="62" y="106" font-family="sans-serif" font-size="14" font-weight="700">strategy enum (s)</text>
-<rect x="62" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="62" y1="300" x2="422" y2="300" stroke="#64748b"/>
-<line x1="62" y1="120" x2="62" y2="300" stroke="#64748b"/>
-<text x="12" y="132" font-family="sans-serif" font-size="10" fill="#64748b">9.9</text>
-<text x="12" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
-<text x="46.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="47.75893682743604" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="52.04521657000967" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="107.67414055497832" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="406.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="62.0,286.0 63.8,300.0 68.0,265.4 123.7,187.9 422.0,203.6" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="62.0" cy="286.0" r="3.5" fill="#2563eb"/>
-<circle cx="63.8" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="68.0" cy="265.4" r="3.5" fill="#2563eb"/>
-<circle cx="123.7" cy="187.9" r="3.5" fill="#2563eb"/>
-<circle cx="422.0" cy="203.6" r="3.5" fill="#2563eb"/>
-<polyline points="62.0,130.3 63.8,120.0 68.0,124.3" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="62.0" cy="130.3" r="3.5" fill="#dc2626"/>
-<circle cx="63.8" cy="120.0" r="3.5" fill="#dc2626"/>
-<circle cx="68.0" cy="124.3" r="3.5" fill="#dc2626"/>
-<text x="452" y="106" font-family="sans-serif" font-size="14" font-weight="700">cost estimation (s)</text>
-<rect x="452" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="452" y1="300" x2="812" y2="300" stroke="#64748b"/>
-<line x1="452" y1="120" x2="452" y2="300" stroke="#64748b"/>
-<text x="402" y="132" font-family="sans-serif" font-size="10" fill="#64748b">5</text>
-<text x="402" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.54</text>
-<text x="436.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="437.75893682743606" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="442.04521657000964" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="497.6741405549783" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="796.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="452.0,300.0 453.8,298.4 458.0,297.8 513.7,281.4 812.0,299.1" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="452.0" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="453.8" cy="298.4" r="3.5" fill="#2563eb"/>
-<circle cx="458.0" cy="297.8" r="3.5" fill="#2563eb"/>
-<circle cx="513.7" cy="281.4" r="3.5" fill="#2563eb"/>
-<circle cx="812.0" cy="299.1" r="3.5" fill="#2563eb"/>
-<polyline points="452.0,124.3 453.8,120.0 458.0,124.5" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="452.0" cy="124.3" r="3.5" fill="#dc2626"/>
-<circle cx="453.8" cy="120.0" r="3.5" fill="#dc2626"/>
-<circle cx="458.0" cy="124.5" r="3.5" fill="#dc2626"/>
-<text x="842" y="106" font-family="sans-serif" font-size="14" font-weight="700">ILP construction (s)</text>
-<rect x="842" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="842" y1="300" x2="1202" y2="300" stroke="#64748b"/>
-<line x1="842" y1="120" x2="842" y2="300" stroke="#64748b"/>
-<text x="792" y="132" font-family="sans-serif" font-size="10" fill="#64748b">14</text>
-<text x="792" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.26</text>
-<text x="826.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="827.7589368274361" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="832.0452165700096" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="887.6741405549783" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="1186.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="842.0,300.0 843.8,292.0 848.0,254.2 903.7,203.9 1202.0,208.5" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="842.0" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="843.8" cy="292.0" r="3.5" fill="#2563eb"/>
-<circle cx="848.0" cy="254.2" r="3.5" fill="#2563eb"/>
-<circle cx="903.7" cy="203.9" r="3.5" fill="#2563eb"/>
-<circle cx="1202.0" cy="208.5" r="3.5" fill="#2563eb"/>
-<polyline points="842.0,134.3 843.8,120.0 848.0,129.6" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="842.0" cy="134.3" r="3.5" fill="#dc2626"/>
-<circle cx="843.8" cy="120.0" r="3.5" fill="#dc2626"/>
-<circle cx="848.0" cy="129.6" r="3.5" fill="#dc2626"/>
-<text x="1232" y="106" font-family="sans-serif" font-size="14" font-weight="700">objective build (s)</text>
-<rect x="1232" y="120" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="1232" y1="300" x2="1592" y2="300" stroke="#64748b"/>
-<line x1="1232" y1="120" x2="1232" y2="300" stroke="#64748b"/>
-<text x="1182" y="132" font-family="sans-serif" font-size="10" fill="#64748b">3.3</text>
-<text x="1182" y="300" font-family="sans-serif" font-size="10" fill="#64748b">0.053</text>
-<text x="1216.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="1217.758936827436" y="318" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="1222.0452165700096" y="318" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="1277.6741405549783" y="318" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="1576.0" y="318" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="1232.0,300.0 1233.8,295.0 1238.0,295.7 1293.7,286.3 1592.0,288.4" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="1232.0" cy="300.0" r="3.5" fill="#2563eb"/>
-<circle cx="1233.8" cy="295.0" r="3.5" fill="#2563eb"/>
-<circle cx="1238.0" cy="295.7" r="3.5" fill="#2563eb"/>
-<circle cx="1293.7" cy="286.3" r="3.5" fill="#2563eb"/>
-<circle cx="1592.0" cy="288.4" r="3.5" fill="#2563eb"/>
-<polyline points="1232.0,138.1 1233.8,133.0 1238.0,120.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="1232.0" cy="138.1" r="3.5" fill="#dc2626"/>
-<circle cx="1233.8" cy="133.0" r="3.5" fill="#dc2626"/>
-<circle cx="1238.0" cy="120.0" r="3.5" fill="#dc2626"/>
-<text x="62" y="336" font-family="sans-serif" font-size="14" font-weight="700">solve (s)</text>
-<rect x="62" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="62" y1="530" x2="422" y2="530" stroke="#64748b"/>
-<line x1="62" y1="350" x2="62" y2="530" stroke="#64748b"/>
-<text x="12" y="362" font-family="sans-serif" font-size="10" fill="#64748b">86</text>
-<text x="12" y="530" font-family="sans-serif" font-size="10" fill="#64748b">0.49</text>
-<text x="46.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="47.75893682743604" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="52.04521657000967" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="107.67414055497832" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="406.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="62.0,530.0 63.8,523.9 68.0,523.9 123.7,490.5 422.0,472.4" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="62.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="63.8" cy="523.9" r="3.5" fill="#2563eb"/>
-<circle cx="68.0" cy="523.9" r="3.5" fill="#2563eb"/>
-<circle cx="123.7" cy="490.5" r="3.5" fill="#2563eb"/>
-<circle cx="422.0" cy="472.4" r="3.5" fill="#2563eb"/>
-<polyline points="62.0,352.4 63.8,353.0 68.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="62.0" cy="352.4" r="3.5" fill="#dc2626"/>
-<circle cx="63.8" cy="353.0" r="3.5" fill="#dc2626"/>
-<circle cx="68.0" cy="350.0" r="3.5" fill="#dc2626"/>
-<text x="452" y="336" font-family="sans-serif" font-size="14" font-weight="700">pipeline total (s)</text>
-<rect x="452" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="452" y1="530" x2="812" y2="530" stroke="#64748b"/>
-<line x1="452" y1="350" x2="452" y2="530" stroke="#64748b"/>
-<text x="402" y="362" font-family="sans-serif" font-size="10" fill="#64748b">124</text>
-<text x="402" y="530" font-family="sans-serif" font-size="10" fill="#64748b">3</text>
-<text x="436.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="437.75893682743606" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="442.04521657000964" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="497.6741405549783" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="796.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="452.0,530.0 453.8,514.5 458.0,508.1 513.7,438.3 812.0,419.0" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="452.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="453.8" cy="514.5" r="3.5" fill="#2563eb"/>
-<circle cx="458.0" cy="508.1" r="3.5" fill="#2563eb"/>
-<circle cx="513.7" cy="438.3" r="3.5" fill="#2563eb"/>
-<circle cx="812.0" cy="419.0" r="3.5" fill="#2563eb"/>
-<polyline points="452.0,356.0 453.8,352.1 458.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="452.0" cy="356.0" r="3.5" fill="#dc2626"/>
-<circle cx="453.8" cy="352.1" r="3.5" fill="#dc2626"/>
-<circle cx="458.0" cy="350.0" r="3.5" fill="#dc2626"/>
-<text x="842" y="336" font-family="sans-serif" font-size="14" font-weight="700">unique ILP vars</text>
-<rect x="842" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="842" y1="530" x2="1202" y2="530" stroke="#64748b"/>
-<line x1="842" y1="350" x2="842" y2="530" stroke="#64748b"/>
-<text x="792" y="362" font-family="sans-serif" font-size="10" fill="#64748b">488.5K</text>
-<text x="792" y="530" font-family="sans-serif" font-size="10" fill="#64748b">13.0K</text>
-<text x="826.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="827.7589368274361" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="832.0452165700096" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="887.6741405549783" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="1186.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="842.0,530.0 843.8,526.8 848.0,526.6 903.7,520.2 1202.0,515.7" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="842.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="843.8" cy="526.8" r="3.5" fill="#2563eb"/>
-<circle cx="848.0" cy="526.6" r="3.5" fill="#2563eb"/>
-<circle cx="903.7" cy="520.2" r="3.5" fill="#2563eb"/>
-<circle cx="1202.0" cy="515.7" r="3.5" fill="#2563eb"/>
-<polyline points="842.0,350.6 843.8,350.0 848.0,350.1" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="842.0" cy="350.6" r="3.5" fill="#dc2626"/>
-<circle cx="843.8" cy="350.0" r="3.5" fill="#dc2626"/>
-<circle cx="848.0" cy="350.1" r="3.5" fill="#dc2626"/>
-<text x="1232" y="336" font-family="sans-serif" font-size="14" font-weight="700">constraints</text>
-<rect x="1232" y="350" width="360" height="180" fill="#f8fafc" stroke="#cbd5e1"/>
-<line x1="1232" y1="530" x2="1592" y2="530" stroke="#64748b"/>
-<line x1="1232" y1="350" x2="1232" y2="530" stroke="#64748b"/>
-<text x="1182" y="362" font-family="sans-serif" font-size="10" fill="#64748b">177.2K</text>
-<text x="1182" y="530" font-family="sans-serif" font-size="10" fill="#64748b">7.0K</text>
-<text x="1216.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">1.2</text>
-<text x="1217.758936827436" y="548" font-family="sans-serif" font-size="10" fill="#64748b">3.2</text>
-<text x="1222.0452165700096" y="548" font-family="sans-serif" font-size="10" fill="#64748b">8</text>
-<text x="1277.6741405549783" y="548" font-family="sans-serif" font-size="10" fill="#64748b">71</text>
-<text x="1576.0" y="548" font-family="sans-serif" font-size="10" fill="#64748b">406</text>
-<polyline points="1232.0,530.0 1233.8,522.3 1238.0,520.3 1293.7,500.0 1592.0,486.0" fill="none" stroke="#2563eb" stroke-width="2.4"/>
-<circle cx="1232.0" cy="530.0" r="3.5" fill="#2563eb"/>
-<circle cx="1233.8" cy="522.3" r="3.5" fill="#2563eb"/>
-<circle cx="1238.0" cy="520.3" r="3.5" fill="#2563eb"/>
-<circle cx="1293.7" cy="500.0" r="3.5" fill="#2563eb"/>
-<circle cx="1592.0" cy="486.0" r="3.5" fill="#2563eb"/>
-<polyline points="1232.0,351.3 1233.8,350.2 1238.0,350.0" fill="none" stroke="#dc2626" stroke-width="2.4"/>
-<circle cx="1232.0" cy="351.3" r="3.5" fill="#dc2626"/>
-<circle cx="1233.8" cy="350.2" r="3.5" fill="#dc2626"/>
-<circle cx="1238.0" cy="350.0" r="3.5" fill="#dc2626"/>
-</svg>
\ No newline at end of file
diff --git a/profile_results/real_llama3_dag_analysis.py b/profile_results/real_llama3_dag_analysis.py
deleted file mode 100644
index 03b445a3..00000000
--- a/profile_results/real_llama3_dag_analysis.py
+++ /dev/null
@@ -1,255 +0,0 @@
-import csv
-import json
-import logging
-import re
-import sys
-import time
-from collections import Counter, defaultdict
-from pathlib import Path
-
-import networkx as nx
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-sys.path.insert(0, "/home/wangkj/workspace/torchtitan")
-
-from torchtitan.models.llama3 import llama3_configs  # noqa: E402
-
-from autoparallel.api import AutoParallel
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-
-WORLD_SIZE = 64
-SEQ_LEN = 256
-GLOBAL_BATCH = 64
-
-
-def init_dist():
-    if not torch.distributed.is_initialized():
-        torch.distributed.init_process_group(
-            "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE
-        )
-
-
-def target_name(node):
-    target = node.target
-    if hasattr(target, "__name__"):
-        return target.__name__
-    return str(target)
-
-
-def layer_id(node):
-    stacks = []
-    for key in ("nn_module_stack", "fwd_nn_module_stack"):
-        value = node.meta.get(key)
-        if value:
-            stacks.append(str(value))
-    text = " ".join(stacks)
-    match = re.search(r"layers[._']+([0-9]+)", text)
-    return int(match.group(1)) if match else ""
-
-
-def phase(node):
-    if "fwd_nn_module_stack" in node.meta:
-        return "backward"
-    if "nn_module_stack" in node.meta:
-        return "forward"
-    if node.op == "placeholder" and str(node.name).startswith("tangents"):
-        return "backward"
-    return "unknown"
-
-
-def bitset_counts(nodes, edges):
-    idx = {node: i for i, node in enumerate(nodes)}
-    children = [[] for _ in nodes]
-    parents = [[] for _ in nodes]
-    for src, dst in edges:
-        children[idx[src]].append(idx[dst])
-        parents[idx[dst]].append(idx[src])
-
-    descendants = [0] * len(nodes)
-    for i in range(len(nodes) - 1, -1, -1):
-        bits = 0
-        for child in children[i]:
-            bits |= 1 << child
-            bits |= descendants[child]
-        descendants[i] = bits
-
-    ancestors = [0] * len(nodes)
-    for i in range(len(nodes)):
-        bits = 0
-        for parent in parents[i]:
-            bits |= 1 << parent
-            bits |= ancestors[parent]
-        ancestors[i] = bits
-
-    return (
-        [bits.bit_count() for bits in ancestors],
-        [bits.bit_count() for bits in descendants],
-    )
-
-
-def treewidth_upper_bounds(edges):
-    graph = nx.Graph()
-    graph.add_edges_from(edges)
-    width_min_fill, _ = nx.approximation.treewidth_min_fill_in(graph)
-    width_min_degree, _ = nx.approximation.treewidth_min_degree(graph)
-
-    moral = graph.copy()
-    parents_by_child = defaultdict(list)
-    for src, dst in edges:
-        parents_by_child[dst].append(src)
-    for parents in parents_by_child.values():
-        for i, left in enumerate(parents):
-            for right in parents[i + 1 :]:
-                moral.add_edge(left, right)
-    moral_width_min_fill, _ = nx.approximation.treewidth_min_fill_in(moral)
-    moral_width_min_degree, _ = nx.approximation.treewidth_min_degree(moral)
-    return {
-        "undirected_min_fill": width_min_fill,
-        "undirected_min_degree": width_min_degree,
-        "moralized_min_fill": moral_width_min_fill,
-        "moralized_min_degree": moral_width_min_degree,
-        "undirected_edges": graph.number_of_edges(),
-        "moralized_edges": moral.number_of_edges(),
-    }
-
-
-def run_analysis(out_dir):
-    init_dist()
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda", (64,), mesh_dim_names=("dp",)
-    )
-    set_nccl_topo_config(detect_nccl_topo_config(mesh))
-
-    config = llama3_configs["3B"](attn_backend="sdpa")
-    config.rope.max_seq_len = SEQ_LEN
-    with torch.device("meta"):
-        model = config.build()
-
-    def input_fn():
-        return torch.randint(0, config.vocab_size, (GLOBAL_BATCH, SEQ_LEN), device="cuda")
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
-    )
-    t0 = time.perf_counter()
-    with AutoParallel(
-        model, input_fn, mesh, mp_policy, repeated_subgraphs=True
-    ) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-        autop.add_input_constraints([(Shard(0),)])
-        autop.add_output_constraints([(Shard(0),)])
-        opt = autop.sharding_optimizer
-
-        ilp_nodes = [node for node in opt.nodes if node.op != "output"]
-        ilp_node_set = set(ilp_nodes)
-        edges = []
-        dep_args = {}
-        dep_unique = {}
-        for node in ilp_nodes:
-            inputs = [inp for inp in opt._all_input_nodes(node) if inp in ilp_node_set]
-            dep_args[node] = len(inputs)
-            dep_unique[node] = len(set(inputs))
-            for inp in set(inputs):
-                edges.append((inp, node))
-
-        offspring = Counter()
-        for src, _dst in edges:
-            offspring[src] += 1
-
-        ancestor_counts, descendant_counts = bitset_counts(ilp_nodes, edges)
-        node_to_idx = {node: i for i, node in enumerate(ilp_nodes)}
-        treewidth = treewidth_upper_bounds(edges)
-
-        rows = []
-        for node in ilp_nodes:
-            idx = node_to_idx[node]
-            rows.append(
-                {
-                    "idx": idx,
-                    "name": node.name,
-                    "op": node.op,
-                    "target": target_name(node),
-                    "phase": phase(node),
-                    "layer": layer_id(node),
-                    "direct_dependency_args": dep_args[node],
-                    "direct_dependency_nodes": dep_unique[node],
-                    "direct_offspring_nodes": offspring[node],
-                    "ancestor_count": ancestor_counts[idx],
-                    "descendant_count": descendant_counts[idx],
-                    "strategy_count": len(opt.strats[node].strategies),
-                }
-            )
-
-        merge_points = [
-            row for row in rows if int(row["direct_dependency_nodes"]) > 1
-        ]
-        merge_points.sort(
-            key=lambda row: (
-                -int(row["direct_dependency_nodes"]),
-                -int(row["descendant_count"]),
-                int(row["idx"]),
-            )
-        )
-        fanout_points = sorted(
-            rows,
-            key=lambda row: (-int(row["direct_offspring_nodes"]), int(row["idx"])),
-        )
-
-    out_dir = Path(out_dir)
-    out_dir.mkdir(parents=True, exist_ok=True)
-    node_csv = out_dir / "real_llama3_3b_dag_node_stats.csv"
-    with node_csv.open("w", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
-        writer.writeheader()
-        writer.writerows(rows)
-
-    merge_csv = out_dir / "real_llama3_3b_merge_points.csv"
-    with merge_csv.open("w", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
-        writer.writeheader()
-        writer.writerows(merge_points)
-
-    summary = {
-        "model": "LLaMA3 3B",
-        "mesh": "1D 64",
-        "trace_and_optimizer_build_s": time.perf_counter() - t0,
-        "ilp_nodes": len(ilp_nodes),
-        "dag_edges": len(edges),
-        "merge_points": len(merge_points),
-        "branch_points": sum(1 for row in rows if int(row["direct_offspring_nodes"]) > 1),
-        "max_direct_dependency_nodes": max(int(row["direct_dependency_nodes"]) for row in rows),
-        "max_direct_offspring_nodes": max(int(row["direct_offspring_nodes"]) for row in rows),
-        "max_ancestor_count": max(int(row["ancestor_count"]) for row in rows),
-        "max_descendant_count": max(int(row["descendant_count"]) for row in rows),
-        "treewidth_upper_bounds": treewidth,
-        "direct_dependency_histogram": dict(
-            sorted(Counter(int(row["direct_dependency_nodes"]) for row in rows).items())
-        ),
-        "direct_offspring_histogram": dict(
-            sorted(Counter(int(row["direct_offspring_nodes"]) for row in rows).items())
-        ),
-        "top_merge_points": merge_points[:30],
-        "top_fanout_points": fanout_points[:30],
-        "node_stats_csv": str(node_csv),
-        "merge_points_csv": str(merge_csv),
-    }
-    summary_path = out_dir / "real_llama3_3b_dag_summary.json"
-    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True))
-    print(json.dumps(summary, indent=2, sort_keys=True))
-
-
-def main():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
-    )
-    run_analysis("profile_results")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/profile_results/real_llama3_optimizer_presolve_3d4d.log b/profile_results/real_llama3_optimizer_presolve_3d4d.log
deleted file mode 100644
index 923ec1f1..00000000
--- a/profile_results/real_llama3_optimizer_presolve_3d4d.log
+++ /dev/null
@@ -1,7 +0,0 @@
-[14:50:20] start model=1B mesh_ndim=3 skip_solve timeout=1200s
-2026-05-26 14:50:29,648 INFO:autoparallel.api:Graph tracing took 6.073s
-2026-05-26 14:58:18,227 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=strategy_enumeration mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B graph_nodes=4140 strategy_options=662279 option_tuples=181062856 elapsed=459.509s
-2026-05-26 15:07:42,067 INFO:autoparallel.optimize_sharding:ShardingOptimizer phase profile: phase=decision_vars mesh_shape=(4, 4, 4) mesh_dim_names=('dp', 'tp', 'cp') mesh_size=64 model_params=1.24B unique_ilp_vars=20390366 logical_decision_vars=181062856 cluster_copied_decision_vars=160672490 elapsed=462.310s
-[15:10:23] done model=1B mesh_ndim=3 rc=124
-[15:10:23] start model=1B mesh_ndim=4 skip_solve timeout=1200s
-2026-05-26 15:10:32,788 INFO:autoparallel.api:Graph tracing took 6.079s
diff --git a/profile_results/real_llama3_optimizer_sweep.csv b/profile_results/real_llama3_optimizer_sweep.csv
deleted file mode 100644
index 30d2e4f5..00000000
--- a/profile_results/real_llama3_optimizer_sweep.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-cluster_copied_decision_vars,compute_cost_estimation_s,constraints_init,constraints_solve,cost_estimation_s,decision_var_build_s,decision_var_overhead_s,edge_cost_estimation_s,extract_s,graph_nodes,ilp_construction_s,logical_decision_vars,max_strategies_per_node,mesh_ndim,mesh_shape,mesh_size,model_key,objective,objective_s,optimizer_pipeline_s,option_tuples,parameter_b,parameter_gib,parameter_nodes,parameter_numel,solve_s,status,strategy_enumeration_s,strategy_options,tensor_nodes,total_wall_s,unique_ilp_vars,validation_s
-101888,0.4790569522883743,7038,7042,0.5384756466373801,0.6978608381468803,0.09741721651516855,0.05941869434900582,0.01627982617355883,4140,0.26339267240837216,114928,10,1,64,64,1B,75411.02054353141,0.053351440001279116,3.032338660908863,114928,1.2358144,4.603767395019531,146,1235814400,0.49132931185886264,Optimal,0.6722944700159132,18503,4139,8.946083615999669,13040,0.31402136106044054
-194792,0.48607375379651785,8080,8084,0.5489266884978861,0.7471572819631547,0.1306607834994793,0.06285293470136821,0.029804171063005924,7200,0.3148333504796028,208698,10,1,64,64,3B,155857.5709074804,0.05978171294555068,4.169702837942168,208698,3.212749824,11.968425750732422,254,3212749824,0.5855939809698611,Optimal,0.5360530489124358,32969,7199,14.472710577072576,13906,0.03955800808034837
-224240,0.49045672081410885,8372,8376,0.5536671618465334,1.1619730349630117,0.5399796243291348,0.06321044103242457,0.03362119919620454,8220,0.7288401401601732,238203,10,1,64,64,8B,213343.3574716149,0.05892709596082568,4.762722868937999,238203,8.030261248,29.915054321289062,291,8030261248,0.5859912640880793,Optimal,0.9387421838473529,37635,8219,16.452271425863728,13963,0.045778295025229454
-596400,0.5983547926880419,12044,12048,0.6777467841748148,2.653722374001518,1.875488000921905,0.0793919914867729,0.2056352950166911,20460,2.2220516917295754,612283,10,1,64,64,70B,965500.0409067452,0.0730493909213692,20.028923405101523,612283,70.553706496,262.8330383300781,723,70553706496,1.5257919810246676,Optimal,3.3026473850477487,95379,20459,50.90600106609054,15883,0.1628595821093768
-946046,0.4775047143921256,15494,15498,0.5445251299533993,2.283439102116972,1.6354041469749063,0.0670204155612737,0.17483325605280697,32190,2.005914915120229,963447,10,1,64,64,405B,3172012.7008089907,0.06962158717215061,29.85055986023508,963447,405.8533888,1511.9216918945312,1137,405853388800,2.56223003892228,Optimal,2.5583339028526098,150073,32189,77.86599416891113,17401,0.18959671608172357
-3854214,1.9979437342844903,173186,173190,4.75627763918601,11.933482899097726,4.112962566781789,2.7583339049015194,0.03040059795603156,4140,10.42051934893243,4337060,82,2,8x8,64,1B,57041.81060181375,2.17517895414494,109.2090197771322,4337060,1.2358144,4.603767395019531,146,1235814400,80.18635749211535,Optimal,8.398531069047749,107753,4139,115.10326781589538,482846,0.024392321007326245
-7135218,2.101260715862736,176564,176568,5.0140090675558895,14.759843383915722,6.347998866345733,2.9127483516931534,0.04800663981586695,7200,14.323183785192668,7623714,82,2,8x8,64,3B,122291.9385011857,2.4431078990455717,118.39831594773568,7623714,3.212749824,11.968425750732422,254,3212749824,78.84844117495231,Optimal,9.923545255092904,188315,7199,130.30269417585805,488496,0.053027451038360596
-8216282,1.9884659524541348,177172,177176,4.743945160182193,13.453818985959515,5.6245344209019095,2.755479207728058,0.04394924081861973,8220,11.563520586816594,8703393,82,2,8x8,64,8B,178228.3264244111,3.2896198199596256,123.55457829684019,8703393,8.030261248,29.915054321289062,291,8030261248,86.02262015617453,Optimal,9.262494687922299,214965,8219,135.2341975120362,487111,0.0497884638607502
diff --git a/profile_results/real_llama3_optimizer_sweep.jsonl b/profile_results/real_llama3_optimizer_sweep.jsonl
deleted file mode 100644
index 67428955..00000000
--- a/profile_results/real_llama3_optimizer_sweep.jsonl
+++ /dev/null
@@ -1,8 +0,0 @@
-{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054}
-{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837}
-{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454}
-{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768}
-{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357}
-{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245}
-{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596}
-{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, "extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502}
diff --git a/profile_results/real_llama3_optimizer_sweep.log b/profile_results/real_llama3_optimizer_sweep.log
deleted file mode 100644
index 21b02b4e..00000000
--- a/profile_results/real_llama3_optimizer_sweep.log
+++ /dev/null
@@ -1,54 +0,0 @@
-[14:16:02] start model=1B mesh_ndim=1 timeout=900s
-2026-05-26 14:16:10,889 INFO:autoparallel.api:Graph tracing took 5.582s
-2026-05-26 14:16:13,492 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 strategy_options=18503 option_tuples=114928 unique_ilp_vars=13040 logical_decision_vars=114928 constraints=7038 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,validation=0.314s,total=2.469s}
-2026-05-26 14:16:14,059 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=1.24B unique_ilp_vars=13040 constraints=7042 status=Optimal objective=75411.0205 timings={strategy_enumeration=0.672s,cost_estimation=0.538s,ilp_construction=0.263s,objective=0.053s,solve=0.491s,extract=0.016s,total_solve_call=0.563s,total_pipeline=3.032s}
-{"cluster_copied_decision_vars": 101888, "compute_cost_estimation_s": 0.4790569522883743, "constraints_init": 7038, "constraints_solve": 7042, "cost_estimation_s": 0.5384756466373801, "decision_var_build_s": 0.6978608381468803, "decision_var_overhead_s": 0.09741721651516855, "edge_cost_estimation_s": 0.05941869434900582, "extract_s": 0.01627982617355883, "graph_nodes": 4140, "ilp_construction_s": 0.26339267240837216, "logical_decision_vars": 114928, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "1B", "objective": 75411.02054353141, "objective_s": 0.053351440001279116, "optimizer_pipeline_s": 3.032338660908863, "option_tuples": 114928, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 0.49132931185886264, "status": "Optimal", "strategy_enumeration_s": 0.6722944700159132, "strategy_options": 18503, "tensor_nodes": 4139, "total_wall_s": 8.946083615999669, "unique_ilp_vars": 13040, "validation_s": 0.31402136106044054}
-[14:16:15] done model=1B mesh_ndim=1 rc=0
-[14:16:15] start model=3B mesh_ndim=1 timeout=900s
-2026-05-26 14:16:27,671 INFO:autoparallel.api:Graph tracing took 9.505s
-2026-05-26 14:16:31,732 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=32969 option_tuples=208698 unique_ilp_vars=13906 logical_decision_vars=208698 constraints=8080 timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,validation=0.040s,total=3.492s}
-2026-05-26 14:16:32,416 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=3.21B unique_ilp_vars=13906 constraints=8084 status=Optimal objective=155857.5709 timings={strategy_enumeration=0.536s,cost_estimation=0.549s,ilp_construction=0.315s,objective=0.060s,solve=0.586s,extract=0.030s,total_solve_call=0.678s,total_pipeline=4.170s}
-{"cluster_copied_decision_vars": 194792, "compute_cost_estimation_s": 0.48607375379651785, "constraints_init": 8080, "constraints_solve": 8084, "cost_estimation_s": 0.5489266884978861, "decision_var_build_s": 0.7471572819631547, "decision_var_overhead_s": 0.1306607834994793, "edge_cost_estimation_s": 0.06285293470136821, "extract_s": 0.029804171063005924, "graph_nodes": 7200, "ilp_construction_s": 0.3148333504796028, "logical_decision_vars": 208698, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "3B", "objective": 155857.5709074804, "objective_s": 0.05978171294555068, "optimizer_pipeline_s": 4.169702837942168, "option_tuples": 208698, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 0.5855939809698611, "status": "Optimal", "strategy_enumeration_s": 0.5360530489124358, "strategy_options": 32969, "tensor_nodes": 7199, "total_wall_s": 14.472710577072576, "unique_ilp_vars": 13906, "validation_s": 0.03955800808034837}
-[14:16:33] done model=3B mesh_ndim=1 rc=0
-[14:16:33] start model=8B mesh_ndim=1 timeout=900s
-2026-05-26 14:16:47,847 INFO:autoparallel.api:Graph tracing took 11.170s
-2026-05-26 14:16:52,205 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=37635 option_tuples=238203 unique_ilp_vars=13963 logical_decision_vars=238203 constraints=8372 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,validation=0.046s,total=4.081s}
-2026-05-26 14:16:52,893 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=8.03B unique_ilp_vars=13963 constraints=8376 status=Optimal objective=213343.3575 timings={strategy_enumeration=0.939s,cost_estimation=0.554s,ilp_construction=0.729s,objective=0.059s,solve=0.586s,extract=0.034s,total_solve_call=0.681s,total_pipeline=4.763s}
-{"cluster_copied_decision_vars": 224240, "compute_cost_estimation_s": 0.49045672081410885, "constraints_init": 8372, "constraints_solve": 8376, "cost_estimation_s": 0.5536671618465334, "decision_var_build_s": 1.1619730349630117, "decision_var_overhead_s": 0.5399796243291348, "edge_cost_estimation_s": 0.06321044103242457, "extract_s": 0.03362119919620454, "graph_nodes": 8220, "ilp_construction_s": 0.7288401401601732, "logical_decision_vars": 238203, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "8B", "objective": 213343.3574716149, "objective_s": 0.05892709596082568, "optimizer_pipeline_s": 4.762722868937999, "option_tuples": 238203, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 0.5859912640880793, "status": "Optimal", "strategy_enumeration_s": 0.9387421838473529, "strategy_options": 37635, "tensor_nodes": 8219, "total_wall_s": 16.452271425863728, "unique_ilp_vars": 13963, "validation_s": 0.045778295025229454}
-[14:16:54] done model=8B mesh_ndim=1 rc=0
-[14:16:54] start model=70B mesh_ndim=1 timeout=900s
-2026-05-26 14:17:27,109 INFO:autoparallel.api:Graph tracing took 29.053s
-2026-05-26 14:17:46,179 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B param_nodes=723 graph_nodes=20460 tensor_nodes=20459 strategy_options=95379 option_tuples=612283 unique_ilp_vars=15883 logical_decision_vars=612283 constraints=12044 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,validation=0.163s,total=18.219s}
-2026-05-26 14:17:48,011 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=70.55B unique_ilp_vars=15883 constraints=12048 status=Optimal objective=965500.0409 timings={strategy_enumeration=3.303s,cost_estimation=0.678s,ilp_construction=2.222s,objective=0.073s,solve=1.526s,extract=0.206s,total_solve_call=1.810s,total_pipeline=20.029s}
-{"cluster_copied_decision_vars": 596400, "compute_cost_estimation_s": 0.5983547926880419, "constraints_init": 12044, "constraints_solve": 12048, "cost_estimation_s": 0.6777467841748148, "decision_var_build_s": 2.653722374001518, "decision_var_overhead_s": 1.875488000921905, "edge_cost_estimation_s": 0.0793919914867729, "extract_s": 0.2056352950166911, "graph_nodes": 20460, "ilp_construction_s": 2.2220516917295754, "logical_decision_vars": 612283, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "70B", "objective": 965500.0409067452, "objective_s": 0.0730493909213692, "optimizer_pipeline_s": 20.028923405101523, "option_tuples": 612283, "parameter_b": 70.553706496, "parameter_gib": 262.8330383300781, "parameter_nodes": 723, "parameter_numel": 70553706496, "solve_s": 1.5257919810246676, "status": "Optimal", "strategy_enumeration_s": 3.3026473850477487, "strategy_options": 95379, "tensor_nodes": 20459, "total_wall_s": 50.90600106609054, "unique_ilp_vars": 15883, "validation_s": 0.1628595821093768}
-[14:17:51] done model=70B mesh_ndim=1 rc=0
-[14:17:51] start model=405B mesh_ndim=1 timeout=900s
-2026-05-26 14:18:40,587 INFO:autoparallel.api:Graph tracing took 45.218s
-2026-05-26 14:19:09,868 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B param_nodes=1137 graph_nodes=32190 tensor_nodes=32189 strategy_options=150073 option_tuples=963447 unique_ilp_vars=17401 logical_decision_vars=963447 constraints=15494 timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,validation=0.190s,total=27.039s}
-2026-05-26 14:19:12,705 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(64,) mesh_dim_names=('dp',) mesh_size=64 model_params=405.85B unique_ilp_vars=17401 constraints=15498 status=Optimal objective=3172012.7008 timings={strategy_enumeration=2.558s,cost_estimation=0.545s,ilp_construction=2.006s,objective=0.070s,solve=2.562s,extract=0.175s,total_solve_call=2.811s,total_pipeline=29.851s}
-{"cluster_copied_decision_vars": 946046, "compute_cost_estimation_s": 0.4775047143921256, "constraints_init": 15494, "constraints_solve": 15498, "cost_estimation_s": 0.5445251299533993, "decision_var_build_s": 2.283439102116972, "decision_var_overhead_s": 1.6354041469749063, "edge_cost_estimation_s": 0.0670204155612737, "extract_s": 0.17483325605280697, "graph_nodes": 32190, "ilp_construction_s": 2.005914915120229, "logical_decision_vars": 963447, "max_strategies_per_node": 10, "mesh_ndim": 1, "mesh_shape": "64", "mesh_size": 64, "model_key": "405B", "objective": 3172012.7008089907, "objective_s": 0.06962158717215061, "optimizer_pipeline_s": 29.85055986023508, "option_tuples": 963447, "parameter_b": 405.8533888, "parameter_gib": 1511.9216918945312, "parameter_nodes": 1137, "parameter_numel": 405853388800, "solve_s": 2.56223003892228, "status": "Optimal", "strategy_enumeration_s": 2.5583339028526098, "strategy_options": 150073, "tensor_nodes": 32189, "total_wall_s": 77.86599416891113, "unique_ilp_vars": 17401, "validation_s": 0.18959671608172357}
-[14:19:15] done model=405B mesh_ndim=1 rc=0
-[14:19:15] start model=1B mesh_ndim=2 timeout=900s
-2026-05-26 14:19:24,184 INFO:autoparallel.api:Graph tracing took 5.551s
-2026-05-26 14:19:51,030 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B param_nodes=146 graph_nodes=4140 tensor_nodes=4139 strategy_options=107753 option_tuples=4337060 unique_ilp_vars=482846 logical_decision_vars=4337060 constraints=173186 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,validation=0.024s,total=26.710s}
-2026-05-26 14:21:13,538 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=1.24B unique_ilp_vars=482846 constraints=173190 status=Optimal objective=57041.8106 timings={strategy_enumeration=8.399s,cost_estimation=4.756s,ilp_construction=10.421s,objective=2.175s,solve=80.186s,extract=0.030s,total_solve_call=82.499s,total_pipeline=109.209s}
-{"cluster_copied_decision_vars": 3854214, "compute_cost_estimation_s": 1.9979437342844903, "constraints_init": 173186, "constraints_solve": 173190, "cost_estimation_s": 4.75627763918601, "decision_var_build_s": 11.933482899097726, "decision_var_overhead_s": 4.112962566781789, "edge_cost_estimation_s": 2.7583339049015194, "extract_s": 0.03040059795603156, "graph_nodes": 4140, "ilp_construction_s": 10.42051934893243, "logical_decision_vars": 4337060, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "1B", "objective": 57041.81060181375, "objective_s": 2.17517895414494, "optimizer_pipeline_s": 109.2090197771322, "option_tuples": 4337060, "parameter_b": 1.2358144, "parameter_gib": 4.603767395019531, "parameter_nodes": 146, "parameter_numel": 1235814400, "solve_s": 80.18635749211535, "status": "Optimal", "strategy_enumeration_s": 8.398531069047749, "strategy_options": 107753, "tensor_nodes": 4139, "total_wall_s": 115.10326781589538, "unique_ilp_vars": 482846, "validation_s": 0.024392321007326245}
-[14:21:16] done model=1B mesh_ndim=2 rc=0
-[14:21:16] start model=3B mesh_ndim=2 timeout=900s
-2026-05-26 14:21:30,429 INFO:autoparallel.api:Graph tracing took 10.867s
-2026-05-26 14:22:08,135 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B param_nodes=254 graph_nodes=7200 tensor_nodes=7199 strategy_options=188315 option_tuples=7623714 unique_ilp_vars=488496 logical_decision_vars=7623714 constraints=176564 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,validation=0.053s,total=36.956s}
-2026-05-26 14:23:29,596 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=3.21B unique_ilp_vars=488496 constraints=176568 status=Optimal objective=122291.9385 timings={strategy_enumeration=9.924s,cost_estimation=5.014s,ilp_construction=14.323s,objective=2.443s,solve=78.848s,extract=0.048s,total_solve_call=81.443s,total_pipeline=118.398s}
-{"cluster_copied_decision_vars": 7135218, "compute_cost_estimation_s": 2.101260715862736, "constraints_init": 176564, "constraints_solve": 176568, "cost_estimation_s": 5.0140090675558895, "decision_var_build_s": 14.759843383915722, "decision_var_overhead_s": 6.347998866345733, "edge_cost_estimation_s": 2.9127483516931534, "extract_s": 0.04800663981586695, "graph_nodes": 7200, "ilp_construction_s": 14.323183785192668, "logical_decision_vars": 7623714, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "3B", "objective": 122291.9385011857, "objective_s": 2.4431078990455717, "optimizer_pipeline_s": 118.39831594773568, "option_tuples": 7623714, "parameter_b": 3.212749824, "parameter_gib": 11.968425750732422, "parameter_nodes": 254, "parameter_numel": 3212749824, "solve_s": 78.84844117495231, "status": "Optimal", "strategy_enumeration_s": 9.923545255092904, "strategy_options": 188315, "tensor_nodes": 7199, "total_wall_s": 130.30269417585805, "unique_ilp_vars": 488496, "validation_s": 0.053027451038360596}
-[14:23:32] done model=3B mesh_ndim=2 rc=0
-[14:23:32] start model=8B mesh_ndim=2 timeout=900s
-2026-05-26 14:23:46,265 INFO:autoparallel.api:Graph tracing took 11.149s
-2026-05-26 14:24:20,655 INFO:autoparallel.optimize_sharding:ShardingOptimizer init profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B param_nodes=291 graph_nodes=8220 tensor_nodes=8219 strategy_options=214965 option_tuples=8703393 unique_ilp_vars=487111 logical_decision_vars=8703393 constraints=177172 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,validation=0.050s,total=34.114s}
-2026-05-26 14:25:50,114 INFO:autoparallel.optimize_sharding:ShardingOptimizer solve profile: mesh_shape=(8, 8) mesh_dim_names=('dp', 'tp') mesh_size=64 model_params=8.03B unique_ilp_vars=487111 constraints=177176 status=Optimal objective=178228.3264 timings={strategy_enumeration=9.262s,cost_estimation=4.744s,ilp_construction=11.564s,objective=3.290s,solve=86.023s,extract=0.044s,total_solve_call=89.441s,total_pipeline=123.555s}
-{"cluster_copied_decision_vars": 8216282, "compute_cost_estimation_s": 1.9884659524541348, "constraints_init": 177172, "constraints_solve": 177176, "cost_estimation_s": 4.743945160182193, "decision_var_build_s": 13.453818985959515, "decision_var_overhead_s": 5.6245344209019095, "edge_cost_estimation_s": 2.755479207728058, "extract_s": 0.04394924081861973, "graph_nodes": 8220, "ilp_construction_s": 11.563520586816594, "logical_decision_vars": 8703393, "max_strategies_per_node": 82, "mesh_ndim": 2, "mesh_shape": "8x8", "mesh_size": 64, "model_key": "8B", "objective": 178228.3264244111, "objective_s": 3.2896198199596256, "optimizer_pipeline_s": 123.55457829684019, "option_tuples": 8703393, "parameter_b": 8.030261248, "parameter_gib": 29.915054321289062, "parameter_nodes": 291, "parameter_numel": 8030261248, "solve_s": 86.02262015617453, "status": "Optimal", "strategy_enumeration_s": 9.262494687922299, "strategy_options": 214965, "tensor_nodes": 8219, "total_wall_s": 135.2341975120362, "unique_ilp_vars": 487111, "validation_s": 0.0497884638607502}
-[14:25:52] done model=8B mesh_ndim=2 rc=0
-[14:25:52] start model=1B mesh_ndim=3 timeout=300s
-2026-05-26 14:26:01,331 INFO:autoparallel.api:Graph tracing took 5.531s
-[14:30:53] done model=1B mesh_ndim=3 rc=124
-[14:30:53] start model=1B mesh_ndim=4 timeout=300s
-2026-05-26 14:31:01,610 INFO:autoparallel.api:Graph tracing took 5.635s
-[14:35:53] done model=1B mesh_ndim=4 rc=124
diff --git a/profile_results/real_llama3_optimizer_sweep.py b/profile_results/real_llama3_optimizer_sweep.py
deleted file mode 100644
index 7e32b14c..00000000
--- a/profile_results/real_llama3_optimizer_sweep.py
+++ /dev/null
@@ -1,351 +0,0 @@
-import argparse
-import csv
-import json
-import logging
-import math
-import sys
-import time
-from pathlib import Path
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-sys.path.insert(0, "/home/wangkj/workspace/torchtitan")
-
-from torchtitan.models.llama3 import llama3_configs  # noqa: E402
-
-from autoparallel.api import AutoParallel
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-
-WORLD_SIZE = 64
-SEQ_LEN = 256
-GLOBAL_BATCH = 64
-MESHES = {
-    1: ((64,), ("dp",)),
-    2: ((8, 8), ("dp", "tp")),
-    3: ((4, 4, 4), ("dp", "tp", "cp")),
-    4: ((4, 4, 2, 2), ("dp", "tp", "cp", "ep")),
-}
-
-
-def init_dist():
-    if not torch.distributed.is_initialized():
-        torch.distributed.init_process_group(
-            "fake", store=FakeStore(), rank=0, world_size=WORLD_SIZE
-        )
-
-
-def flatten_profile(model_key, mesh_ndim, profile, total_wall_s, solve_ran):
-    model = profile["model"]
-    timings = profile["timings"]
-    strategies = profile["strategies"]
-    ilp = profile["ilp"]
-    solve = profile.get("last_solve", {})
-    return {
-        "model_key": model_key,
-        "mesh_ndim": mesh_ndim,
-        "mesh_shape": "x".join(map(str, profile["mesh"]["shape"])),
-        "mesh_size": profile["mesh"]["size"],
-        "parameter_numel": model["parameter_numel"],
-        "parameter_b": model["parameter_numel"] / 1_000_000_000,
-        "parameter_gib": model["parameter_bytes"] / (1024**3),
-        "graph_nodes": model["graph_nodes"],
-        "tensor_nodes": model["tensor_nodes"],
-        "parameter_nodes": model["parameter_nodes"],
-        "strategy_options": strategies["strategy_options"],
-        "option_tuples": strategies["option_tuples"],
-        "max_strategies_per_node": strategies["max_strategies_per_node"],
-        "unique_ilp_vars": ilp["unique_variables"],
-        "logical_decision_vars": ilp["logical_decision_variables"],
-        "cluster_copied_decision_vars": ilp["cluster_copied_decision_variables"],
-        "constraints_init": ilp["constraints"],
-        "constraints_presolve": profile.get("constraints_presolve", ilp["constraints"]),
-        "constraints_solve": solve.get("constraints", ""),
-        "strategy_enumeration_s": timings["strategy_enumeration_s"],
-        "compute_cost_estimation_s": timings["compute_cost_estimation_s"],
-        "edge_cost_estimation_s": timings["edge_cost_estimation_s"],
-        "cost_estimation_s": timings["cost_estimation_s"],
-        "decision_var_build_s": timings["decision_var_build_s"],
-        "decision_var_overhead_s": timings["decision_var_overhead_s"],
-        "ilp_construction_s": timings["ilp_construction_s"],
-        "validation_s": timings["validation_s"],
-        "objective_s": solve.get("objective_s", ""),
-        "solve_s": solve.get("solve_s", ""),
-        "extract_s": solve.get("extract_s", ""),
-        "optimizer_pipeline_s": solve.get(
-            "pipeline_total_s",
-            timings["init_total_s"],
-        ),
-        "total_wall_s": total_wall_s,
-        "objective": solve.get("objective", ""),
-        "status": solve.get("status", "NotSolved"),
-        "solve_ran": solve_ran,
-    }
-
-
-def run_one(model_key, mesh_ndim, skip_solve=False):
-    init_dist()
-    mesh_shape, mesh_dim_names = MESHES[mesh_ndim]
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda", mesh_shape, mesh_dim_names=mesh_dim_names
-    )
-    set_nccl_topo_config(detect_nccl_topo_config(mesh))
-
-    config = llama3_configs[model_key](attn_backend="sdpa")
-    config.rope.max_seq_len = SEQ_LEN
-    with torch.device("meta"):
-        model = config.build()
-
-    def input_fn():
-        return torch.randint(
-            0,
-            config.vocab_size,
-            (GLOBAL_BATCH, SEQ_LEN),
-            device="cuda",
-        )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
-    )
-    t0 = time.perf_counter()
-    with AutoParallel(
-        model,
-        input_fn,
-        mesh,
-        mp_policy,
-        repeated_subgraphs=True,
-    ) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-        input_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
-        if mesh.ndim == 1:
-            output_sharding = (Shard(0),)
-        else:
-            output_sharding = (Shard(0), Shard(2)) + (Replicate(),) * (
-                mesh.ndim - 2
-            )
-        autop.add_input_constraints([input_sharding])
-        autop.add_output_constraints([output_sharding])
-        autop.sharding_optimizer.profile["constraints_presolve"] = len(
-            autop.sharding_optimizer.prob.constraints
-        )
-        if not skip_solve:
-            autop.optimize_placement(verbose=False)
-        profile = autop.sharding_optimizer.profile
-    return flatten_profile(
-        model_key,
-        mesh_ndim,
-        profile,
-        time.perf_counter() - t0,
-        solve_ran=not skip_solve,
-    )
-
-
-def append_jsonl(path, row):
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("a") as f:
-        f.write(json.dumps(row, sort_keys=True) + "\n")
-
-
-def load_rows(path):
-    rows = []
-    with Path(path).open() as f:
-        for line in f:
-            line = line.strip()
-            if line:
-                row = json.loads(line)
-                row.setdefault(
-                    "constraints_presolve",
-                    row.get("constraints_solve") or row.get("constraints_init"),
-                )
-                row.setdefault("solve_ran", row.get("solve_s", "") != "")
-                rows.append(row)
-    rows.sort(key=lambda r: (r["mesh_ndim"], r["parameter_numel"]))
-    return rows
-
-
-def write_csv(rows, path):
-    fields = []
-    for row in rows:
-        for key in row:
-            if key not in fields:
-                fields.append(key)
-    with Path(path).open("w", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=fields)
-        writer.writeheader()
-        writer.writerows(rows)
-
-
-def nice(v):
-    if v >= 1_000_000_000:
-        return f"{v / 1_000_000_000:.1f}B"
-    if v >= 1_000_000:
-        return f"{v / 1_000_000:.1f}M"
-    if v >= 1_000:
-        return f"{v / 1_000:.1f}K"
-    if v >= 10:
-        return f"{v:.0f}"
-    return f"{v:.2g}"
-
-
-def write_svg(rows, path, x_key, series_key, title):
-    metrics = [
-        ("strategy_enumeration_s", "strategy enum (s)"),
-        ("cost_estimation_s", "cost estimation (s)"),
-        ("ilp_construction_s", "ILP construction (s)"),
-        ("objective_s", "objective build (s)"),
-        ("solve_s", "solve (s)"),
-        ("optimizer_pipeline_s", "pipeline total (s)"),
-        ("unique_ilp_vars", "unique ILP vars"),
-        ("constraints_presolve", "constraints"),
-    ]
-    width = 1600
-    height = 1000
-    panel_w = 360
-    panel_h = 180
-    margin_l = 62
-    margin_t = 120
-    gap_x = 30
-    gap_y = 50
-    colors = ["#2563eb", "#dc2626", "#16a34a", "#9333ea", "#ea580c"]
-
-    def sx(x, xs, px):
-        lo, hi = min(xs), max(xs)
-        if lo == hi:
-            return px + panel_w / 2
-        return px + (x - lo) / (hi - lo) * panel_w
-
-    def sy(y, ys, py):
-        positives = [v for v in ys if v > 0]
-        lo = min(positives)
-        hi = max(positives)
-        if lo == hi:
-            return py + panel_h / 2
-        return py + panel_h - (math.log10(max(y, lo)) - math.log10(lo)) / (
-            math.log10(hi) - math.log10(lo)
-        ) * panel_h
-
-    series_values = sorted({r[series_key] for r in rows})
-    x_values = sorted({float(r[x_key]) for r in rows})
-    svg = [
-        f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">',
-        '<rect width="100%" height="100%" fill="white"/>',
-        f'<text x="32" y="34" font-family="sans-serif" font-size="22" font-weight="700">{title}</text>',
-        '<text x="32" y="58" font-family="sans-serif" font-size="12" fill="#475569">Y axes are log scale. Missing series points timed out or were not run.</text>',
-    ]
-    for i, value in enumerate(series_values):
-        x = 32 + (i % 8) * 180
-        y = 84 + (i // 8) * 20
-        svg.append(
-            f'<line x1="{x}" y1="{y}" x2="{x + 28}" y2="{y}" stroke="{colors[i % len(colors)]}" stroke-width="3"/>'
-        )
-        svg.append(
-            f'<text x="{x + 36}" y="{y + 4}" font-family="sans-serif" font-size="12" fill="#334155">{series_key}={value}</text>'
-        )
-
-    for idx, (metric, label) in enumerate(metrics):
-        col = idx % 4
-        row = idx // 4
-        px = margin_l + col * (panel_w + gap_x)
-        py = margin_t + row * (panel_h + gap_y)
-        ys = [
-            float(r[metric])
-            for r in rows
-            if r.get(metric) not in {"", None} and float(r[metric]) > 0
-        ]
-        if not ys:
-            continue
-        svg.extend(
-            [
-                f'<text x="{px}" y="{py - 14}" font-family="sans-serif" font-size="14" font-weight="700">{label}</text>',
-                f'<rect x="{px}" y="{py}" width="{panel_w}" height="{panel_h}" fill="#f8fafc" stroke="#cbd5e1"/>',
-                f'<line x1="{px}" y1="{py + panel_h}" x2="{px + panel_w}" y2="{py + panel_h}" stroke="#64748b"/>',
-                f'<line x1="{px}" y1="{py}" x2="{px}" y2="{py + panel_h}" stroke="#64748b"/>',
-                f'<text x="{px - 50}" y="{py + 12}" font-family="sans-serif" font-size="10" fill="#64748b">{nice(max(ys))}</text>',
-                f'<text x="{px - 50}" y="{py + panel_h}" font-family="sans-serif" font-size="10" fill="#64748b">{nice(min(ys))}</text>',
-            ]
-        )
-        for xv in x_values:
-            svg.append(
-                f'<text x="{sx(xv, x_values, px) - 16}" y="{py + panel_h + 18}" font-family="sans-serif" font-size="10" fill="#64748b">{nice(xv)}</text>'
-            )
-        for sidx, series in enumerate(series_values):
-            pts = sorted(
-                [r for r in rows if r[series_key] == series],
-                key=lambda r: float(r[x_key]),
-            )
-            color = colors[sidx % len(colors)]
-            coords = [
-                (
-                    sx(float(r[x_key]), x_values, px),
-                    sy(float(r[metric]), ys, py),
-                )
-                for r in pts
-                if r.get(metric) not in {"", None} and float(r[metric]) > 0
-            ]
-            if len(coords) >= 2:
-                svg.append(
-                    '<polyline points="'
-                    + " ".join(f"{x:.1f},{y:.1f}" for x, y in coords)
-                    + f'" fill="none" stroke="{color}" stroke-width="2.4"/>'
-                )
-            for x, y in coords:
-                svg.append(f'<circle cx="{x:.1f}" cy="{y:.1f}" r="3.5" fill="{color}"/>')
-    svg.append("</svg>")
-    Path(path).write_text("\n".join(svg))
-
-
-def plot(jsonl, out_dir):
-    out_dir = Path(out_dir)
-    out_dir.mkdir(parents=True, exist_ok=True)
-    rows = load_rows(jsonl)
-    write_csv(rows, out_dir / "real_llama3_optimizer_sweep.csv")
-    write_svg(
-        rows,
-        out_dir / "real_llama3_by_model_size.svg",
-        "parameter_b",
-        "mesh_ndim",
-        "Real Llama3 optimizer profile vs model size",
-    )
-    write_svg(
-        rows,
-        out_dir / "real_llama3_by_mesh_dim.svg",
-        "mesh_ndim",
-        "model_key",
-        "Real Llama3 optimizer profile vs mesh dimension",
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    sub = parser.add_subparsers(dest="cmd", required=True)
-    run = sub.add_parser("run-one")
-    run.add_argument("--model-key", choices=llama3_configs.keys(), required=True)
-    run.add_argument("--mesh-ndim", type=int, choices=MESHES.keys(), required=True)
-    run.add_argument("--out-jsonl", required=True)
-    run.add_argument("--skip-solve", action="store_true")
-    plot_cmd = sub.add_parser("plot")
-    plot_cmd.add_argument("--jsonl", required=True)
-    plot_cmd.add_argument("--out-dir", required=True)
-    args = parser.parse_args()
-
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
-    )
-    logging.getLogger("autoparallel.optimize_sharding").setLevel(logging.INFO)
-
-    if args.cmd == "run-one":
-        row = run_one(args.model_key, args.mesh_ndim, skip_solve=args.skip_solve)
-        append_jsonl(args.out_jsonl, row)
-        print(json.dumps(row, sort_keys=True))
-    else:
-        plot(args.jsonl, args.out_dir)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/profile_results/real_llama3_partial_presolve.csv b/profile_results/real_llama3_partial_presolve.csv
deleted file mode 100644
index ab7b7fa9..00000000
--- a/profile_results/real_llama3_partial_presolve.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-model_key,mesh_ndim,mesh_shape,parameter_b,graph_nodes,strategy_options,option_tuples,strategy_enumeration_s,unique_ilp_vars,logical_decision_vars,cluster_copied_decision_vars,decision_var_build_s,constraints,solve_s,status
-1B,3,4x4x4,1.2358144,4140,662279,181062856,459.509,20390366,181062856,160672490,462.310,,,timeout_before_constraints
-1B,4,4x4x2x2,1.2358144,,,,,,,,,,,not_run
diff --git a/profile_results/real_llama3_timeouts.csv b/profile_results/real_llama3_timeouts.csv
deleted file mode 100644
index c3e6c843..00000000
--- a/profile_results/real_llama3_timeouts.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-model_key,mesh_ndim,mesh_shape,timeout_s,result
-1B,3,4x4x4,1200,timeout_after_decision_vars_before_constraints
-1B,4,4x4x2x2,,not_run
diff --git a/qwen3_8b_autoparallel_30steps.log b/qwen3_8b_autoparallel_30steps.log
deleted file mode 120000
index 5cc45d55..00000000
--- a/qwen3_8b_autoparallel_30steps.log
+++ /dev/null
@@ -1 +0,0 @@
-/tmp/qwen3_8b_autoparallel_30steps.log
\ No newline at end of file
diff --git a/tests/test_dp_solver.py b/tests/test_dp_solver.py
new file mode 100644
index 00000000..3dbb2d10
--- /dev/null
+++ b/tests/test_dp_solver.py
@@ -0,0 +1,158 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import operator
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from autoparallel.graph_passes.graph_utils import all_input_nodes
+from autoparallel.optimize_sharding import DPBasedShardingSolver
+
+
+class _FakeOptimizer:
+    def __init__(self, graph):
+        self.graph = graph
+        self.strats = {node: object() for node in graph.nodes}
+        self.nodes = list(self.strats.keys())
+
+    def _all_input_nodes(self, node):
+        return [
+            input_node
+            for input_node in all_input_nodes(node)
+            if input_node in self.strats
+        ]
+
+
+def _assert_predecessors_match_graph_indegrees(topology):
+    topology_nodes = set(topology.nodes)
+    assert set(topology.predecessors) == topology_nodes
+    assert set(topology.node_to_index) == topology_nodes
+
+    for node in topology.nodes:
+        expected_predecessors = [
+            input_node
+            for input_node in all_input_nodes(node)
+            if input_node in topology_nodes
+        ]
+        predecessors = topology.predecessors[node]
+        assert len(predecessors) == len(expected_predecessors)
+        assert predecessors == expected_predecessors
+
+
+def test_dp_solver_builds_topological_order_for_merge_graph():
+    class MergeModule(torch.nn.Module):
+        def forward(self, x, y):
+            a = x + y
+            b = x * 2
+            return a + b
+
+    graph = torch.fx.symbolic_trace(MergeModule()).graph
+    solver = DPBasedShardingSolver(_FakeOptimizer(graph))
+
+    topology = solver.build_topological_order()
+
+    assert all(node.op != "output" for node in topology.nodes)
+    assert topology.nodes == [node for node in graph.nodes if node.op != "output"]
+    _assert_predecessors_match_graph_indegrees(topology)
+
+    for node, predecessors in topology.predecessors.items():
+        node_index = topology.node_to_index[node]
+        for pred in predecessors:
+            assert topology.node_to_index[pred] < node_index
+
+    merge = topology.nodes[-1]
+    assert [pred.name for pred in topology.predecessors[merge]] == ["add", "mul"]
+
+
+def test_dp_solver_preserves_duplicate_predecessors():
+    class DuplicateInputModule(torch.nn.Module):
+        def forward(self, x):
+            return x + x
+
+    graph = torch.fx.symbolic_trace(DuplicateInputModule()).graph
+    solver = DPBasedShardingSolver(_FakeOptimizer(graph))
+
+    topology = solver.build_topological_order()
+    _assert_predecessors_match_graph_indegrees(topology)
+
+    add_node = next(node for node in topology.nodes if node.op == "call_function")
+    predecessors = topology.predecessors[add_node]
+    assert len(predecessors) == 2
+    assert predecessors[0] is predecessors[1]
+    assert predecessors[0].name == "x"
+
+
+def test_dp_solver_topology_for_tiny_transformer_forward():
+    class TinyTransformerBlock(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.q = torch.nn.Linear(8, 8)
+            self.k = torch.nn.Linear(8, 8)
+            self.v = torch.nn.Linear(8, 8)
+            self.o = torch.nn.Linear(8, 8)
+            self.ff1 = torch.nn.Linear(8, 16)
+            self.ff2 = torch.nn.Linear(16, 8)
+
+        def forward(self, x):
+            q = self.q(x)
+            k = self.k(x)
+            v = self.v(x)
+            scores = q @ k.transpose(-2, -1) / math.sqrt(8)
+            attn = F.softmax(scores, dim=-1)
+            attn_out = attn @ v
+            x = x + self.o(attn_out)
+            hidden = F.relu(self.ff1(x))
+            return x + self.ff2(hidden)
+
+    block = TinyTransformerBlock()
+    assert block(torch.randn(2, 4, 8)).shape == (2, 4, 8)
+
+    graph = torch.fx.symbolic_trace(block).graph
+    solver = DPBasedShardingSolver(_FakeOptimizer(graph))
+
+    topology = solver.build_topological_order()
+    _assert_predecessors_match_graph_indegrees(topology)
+    node_names = [node.name for node in topology.nodes]
+
+    assert node_names == [
+        "x",
+        "q",
+        "k",
+        "v",
+        "transpose",
+        "matmul",
+        "truediv",
+        "softmax",
+        "matmul_1",
+        "o",
+        "add",
+        "ff1",
+        "relu",
+        "ff2",
+        "add_1",
+    ]
+
+    add_nodes = [node for node in topology.nodes if node.target is operator.add]
+    assert [node.name for node in add_nodes] == ["add", "add_1"]
+    assert [pred.name for pred in topology.predecessors[add_nodes[0]]] == ["x", "o"]
+    assert [pred.name for pred in topology.predecessors[add_nodes[1]]] == [
+        "add",
+        "ff2",
+    ]
+
+
+def test_dp_solver_solution_is_not_implemented():
+    class SimpleModule(torch.nn.Module):
+        def forward(self, x):
+            return x + 1
+
+    graph = torch.fx.symbolic_trace(SimpleModule()).graph
+    solver = DPBasedShardingSolver(_FakeOptimizer(graph))
+
+    with pytest.raises(NotImplementedError, match="only builds topological order"):
+        solver.get_solution()
diff --git a/tests/test_lp_relaxation.py b/tests/test_lp_relaxation.py
new file mode 100644
index 00000000..1b03e6fe
--- /dev/null
+++ b/tests/test_lp_relaxation.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import pulp
+import pytest
+import torch
+from conftest import apply_cuda_patches
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+
+
+def _fake_dp4_tp4_mesh():
+    return torch.distributed.device_mesh.init_device_mesh(
+        "cuda",
+        (4, 4),
+        mesh_dim_names=("dp", "tp"),
+    )
+
+
+def _llama3_example_autop(device_mesh):
+    vocab_size = 128
+    seq_len = 16
+    batch_size = 2 * device_mesh.shape[0]
+    model_args = TransformerModelArgs(
+        dim=64,
+        n_layers=1,
+        n_heads=4,
+        n_kv_heads=2,
+        vocab_size=vocab_size,
+        multiple_of=32,
+        rope_theta=500000,
+        max_seq_len=seq_len,
+    )
+    with torch.device("meta"):
+        model = Transformer(model_args)
+
+    def input_fn():
+        return torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,
+        reduce_dtype=torch.float32,
+    )
+    return AutoParallel(
+        model,
+        input_fn,
+        device_mesh,
+        mp_policy,
+        repeated_subgraphs=True,
+    )
+
+
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+@pytest.mark.filterwarnings("ignore:Using LpProblem.constraints")
+def test_lp_relaxation_certifies_llama3_example_search():
+    mesh = _fake_dp4_tp4_mesh()
+    with _llama3_example_autop(mesh) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        x_sharding = (Shard(0), Replicate())
+        out_sharding = (Shard(0), Shard(2))
+        autop.add_input_constraints([x_sharding])
+        autop.add_output_constraints([out_sharding])
+
+        opt = autop.sharding_optimizer
+
+        binary_vars = list(opt.pulp_variables.values())
+        assert binary_vars
+        assert all(var.cat == pulp.LpInteger for var in binary_vars)
+        assert all(var.lowBound == 0 and var.upBound == 1 for var in binary_vars)
+
+        continuous_vars = opt._create_pulp_variables(pulp.LpContinuous)
+        assert continuous_vars
+        assert all(var.cat == pulp.LpContinuous for var in continuous_vars.values())
+        assert all(
+            var.lowBound == 0 and var.upBound == 1 for var in continuous_vars.values()
+        )
+
+        lower_bound = opt.get_lower_bound()
+        assert lower_bound.status == "Optimal"
+        assert math.isfinite(lower_bound.objective)
+        assert lower_bound.objective >= 0
+
+        assert not hasattr(opt, "selected_keys")
+        assert opt.prob.objective is None
+        assert all(var.cat == pulp.LpInteger for var in opt.pulp_variables.values())
+
+        solution = opt.get_solution()
+        feasible_cost = pulp.value(opt.prob.objective)
+        certificate_gap = (
+            feasible_cost - lower_bound.objective
+        ) / lower_bound.objective
+        assert solution
+        assert lower_bound.objective <= feasible_cost + 1e-5
+        assert certificate_gap >= -1e-8
+        assert math.isfinite(certificate_gap)

From ad7ee80972acfd1994e8ff11ec0564d4d479273b Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Fri, 29 May 2026 15:48:49 -0700
Subject: [PATCH 04/27] Checkpoint scratch LP benchmark and ignore reference
 PDFs

Snapshot the current working tree before adding the approximate sharding
solver. Tracks the scratch _bench_lp_3d.py benchmark and adds *.pdf to
.gitignore so reference papers stay out of git history.

Authored with Claude.
---
 .gitignore               |   2 +
 examples/_bench_lp_3d.py | 107 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 examples/_bench_lp_3d.py

diff --git a/.gitignore b/.gitignore
index ff4f7532..1a6228f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,8 @@
 .mypy_cache/
 *.egg-info/
 
+*.pdf
+
 build/
 dist/
 tmp/
diff --git a/examples/_bench_lp_3d.py b/examples/_bench_lp_3d.py
new file mode 100644
index 00000000..5b08840b
--- /dev/null
+++ b/examples/_bench_lp_3d.py
@@ -0,0 +1,107 @@
+"""Benchmark LP-relaxation solve time for LLaMA3 on a 3D mesh."""
+import logging
+import os
+import time
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.WARNING)
+
+MODEL_TYPE = os.environ.get("MODEL_TYPE", "8b")
+N_LAYERS = int(os.environ.get("N_LAYERS", "0"))  # 0 => use default for model
+SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4)))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+MESH_NAMES = ("dp", "cp", "tp")
+
+world_size = 1
+for d in MESH_SHAPE:
+    world_size *= d
+
+fake_store = FakeStore()
+torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size)
+
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=MESH_NAMES)
+
+batch_size = 2 * mesh.shape[0]
+seqlen = SEQLEN
+vocab_size = 128256
+device = torch.device("cuda")
+
+
+def model_fn():
+    if MODEL_TYPE == "1b":
+        args = TransformerModelArgs(
+            dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
+            ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
+    elif MODEL_TYPE == "8b":
+        args = TransformerModelArgs(
+            dim=4096, n_layers=32, n_heads=32, n_kv_heads=8,
+            ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
+    elif MODEL_TYPE == "70b":
+        args = TransformerModelArgs(
+            dim=8192, n_layers=80, n_heads=64, n_kv_heads=8,
+            ffn_dim_multiplier=1.3, multiple_of=4096, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
+    else:
+        raise ValueError(MODEL_TYPE)
+    if N_LAYERS:
+        args.n_layers = N_LAYERS
+    return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, seqlen), device=device)
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+
+with torch.device("meta"):
+    model = model_fn()
+
+mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+
+print(f"=== model={MODEL_TYPE} n_layers={model.model_args.n_layers} "
+      f"mesh={MESH_SHAPE}{MESH_NAMES} world_size={world_size} ===")
+
+print("[build] entering AutoParallel (graph export + strategy enumeration)...", flush=True)
+t_build = time.perf_counter()
+with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True) as autop:
+    print(f"[build] AutoParallel ready in {time.perf_counter() - t_build:.2f} s", flush=True)
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
+    autop.add_input_constraints([x_sharding])
+    autop.add_output_constraints([x_sharding])
+    print(f"[build+constraints] {time.perf_counter() - t_build:.2f} s")
+
+    opt = autop.sharding_optimizer
+    print(f"[problem] unique_vars={len(opt.pulp_variables)} "
+          f"constraints={len(opt.prob.constraints)}", flush=True)
+
+    mode = os.environ.get("SOLVE_MODE", "lp")  # lp | ilp | both
+
+    if mode in ("lp", "both"):
+        res = opt.get_lower_bound(verbose=False)
+        print(f"[LP relaxation] status={res.status} objective={res.objective:.4f}")
+        print(f"[LP relaxation] solve_s={res.solve_s:.3f}  total_s={res.total_s:.3f}", flush=True)
+
+    if mode in ("ilp", "both"):
+        print("[ILP] solving (this may take a long time)...", flush=True)
+        t_ilp = time.perf_counter()
+        opt.get_solution(verbose=True)
+        import pulp
+        obj = pulp.value(opt.prob.objective)
+        print(f"[ILP] status={pulp.LpStatus[opt.prob.status]} objective={obj}")
+        print(f"[ILP] solve+extract_s={time.perf_counter() - t_ilp:.3f}", flush=True)

From 6613928a8320b02c982f6a62b17102615706a693 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 10:51:28 -0700
Subject: [PATCH 05/27] Add approximate belief-propagation sharding solver

Adds a heuristic alternative to the ILP for the placement problem,
formulated as pairwise MRF energy minimization on the strategy DAG and
solved with a sequential min-sum belief propagation over coupled groups,
followed by coordinate-descent and star-block local search. The energy is
an exact transcription of the ILP objective, so the assignment is scored
identically and the gap is small (LP-certified within ~3-8% on LLaMA3 1B),
while the solve runs ~10x faster than CBC and works on 3D meshes where the
ILP is intractable. Exposed via optimize_placement(solver="approx").

Review order: optimize_sharding.py (idempotent _set_objective) and api.py
(solver dispatch) are the integration points; approximate_sharding.py is the
solver; test_approximate_sharding.py checks the objective gap, energy
faithfulness, and flow feasibility against the ILP.

Authored with Claude.
---
 autoparallel/api.py                  |   23 +-
 autoparallel/approximate_sharding.py | 1058 ++++++++++++++++++++++++++
 autoparallel/optimize_sharding.py    |   10 +-
 examples/_bench_approx.py            |  166 ++++
 tests/test_approximate_sharding.py   |  140 ++++
 5 files changed, 1394 insertions(+), 3 deletions(-)
 create mode 100644 autoparallel/approximate_sharding.py
 create mode 100644 examples/_bench_approx.py
 create mode 100644 tests/test_approximate_sharding.py

diff --git a/autoparallel/api.py b/autoparallel/api.py
index 1670d509..907d6111 100644
--- a/autoparallel/api.py
+++ b/autoparallel/api.py
@@ -356,10 +356,29 @@ def add_output_constraints(self, constraints):
         self.sharding_optimizer.add_sharded_output_constraint(constraints)
         self.output_constraints = constraints
 
-    def optimize_placement(self, verbose=True):
+    def optimize_placement(self, verbose=True, solver="ilp", approximate_options=None):
+        """Solve for the optimal placement.
+
+        solver="ilp" (default) uses the exact PuLP/CBC solver. solver="approx"
+        uses the heuristic ApproximateShardingSolver, which trades a small
+        objective gap for a much faster solve. approximate_options is forwarded
+        as kwargs to the approximate solver (e.g. candidate_limit, max_sweeps).
+        """
         self._assert_entered()
 
-        self.sharding_placement = self.sharding_optimizer.get_solution(verbose=False)
+        if solver in ("approx", "approximate"):
+            from .approximate_sharding import ApproximateShardingSolver
+
+            approx = ApproximateShardingSolver(
+                self.sharding_optimizer, **(approximate_options or {})
+            )
+            self.sharding_placement = approx.get_solution(verbose=verbose)
+        elif solver == "ilp":
+            self.sharding_placement = self.sharding_optimizer.get_solution(
+                verbose=False
+            )
+        else:
+            raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'")
 
         if verbose:
             logger.info(self.sharding_optimizer.get_log(verbose=True))
diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
new file mode 100644
index 00000000..aba6111c
--- /dev/null
+++ b/autoparallel/approximate_sharding.py
@@ -0,0 +1,1058 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Approximate sharding solver.
+
+The ILP in :mod:`optimize_sharding` selects, for every operation, an output
+placement and (per argument) the input placement of its producer. The flow
+constraint forces a consumer's input placement to equal its producer's chosen
+output placement, so the only genuinely free variables are the per-node output
+strategy indices ``x_v``. The problem therefore reduces to a pairwise discrete
+energy minimization over a DAG::
+
+    E(x) = Σ_v U_v(x_v) + Σ_{(u,v)} B_{uv}(x_u, x_v)
+
+where ``U_v`` is the compute cost and ``B_{uv}`` is the communication +
+sharding-transition cost on the edge from producer ``u`` to consumer ``v``.
+
+This is a pairwise MRF. The autograd DAG has small in-degree (<3) but large
+out-degree (tens) and a wide topological frontier (hundreds), so exact
+frontier/junction-tree DP blows up. We instead solve it with **min-sum belief
+propagation** (max-product in min-sum form) on the graph of *coupled groups*,
+which propagates coordinated decisions globally, then polish with group-level
+coordinate descent and a star-block local search.
+
+Nodes that must be chosen jointly are merged into groups: repeated-subgraph
+cluster copies share a strategy index, and forward/backward pairs share an
+output placement. The solver reuses the strategies, decision variables and
+constraints already built by ``ShardingOptimizer`` (it replaces only the
+CBC/ILP *solve*, not problem construction) and writes its assignment back into
+the PuLP variables, so the result is scored with the exact same objective as the
+ILP (``pulp.value(prob.objective)``).
+"""
+
+import logging
+import math
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+import numpy as np
+import pulp
+
+from .cost_models.compute_estimation import _get_sharded_shape_stride
+
+logger = logging.getLogger(__name__)
+
+INF = float("inf")
+BIG = 1e12  # finite stand-in for forbidden combinations (avoids NaN in min-sum)
+
+# Paired forward/backward constraints couple two nodes to the *same output
+# placement* (the strategy index may differ between the two strategy lists).
+_PAIRED_PREFIXES = (
+    "grad_param_constraint",
+    "grad_input_constraint",
+    "grad_output_constraint",
+)
+
+
+@dataclass
+class ApproximateSolveResult:
+    objective: float
+    status: str
+    build_s: float
+    solve_s: float
+    total_s: float
+    num_groups: int
+    num_nodes: int
+
+
+@dataclass
+class _Group:
+    """A set of node indices chosen jointly (cluster copies share a strategy
+    index; forward/backward pairs share an output placement)."""
+
+    members: list[int]
+    cost_bearing: list[int] = field(default_factory=list)
+    choices: list[dict[int, int]] = field(default_factory=list)  # member -> out_idx
+    current: int = 0
+
+    @property
+    def domain(self) -> int:
+        return len(self.choices)
+
+
+class _UnionFind:
+    def __init__(self, n: int):
+        self.parent = list(range(n))
+
+    def find(self, x: int) -> int:
+        root = x
+        while self.parent[root] != root:
+            root = self.parent[root]
+        while self.parent[x] != root:
+            self.parent[x], x = root, self.parent[x]
+        return root
+
+    def union(self, a: int, b: int) -> None:
+        ra, rb = self.find(a), self.find(b)
+        if ra != rb:
+            self.parent[rb] = ra
+
+
+class ApproximateShardingSolver:
+    """Approximate solver for the sharding placement problem on an already-built
+    :class:`ShardingOptimizer`.
+
+    Call :meth:`get_solution` for a ``{node: OpSpec}`` dict (same format as
+    ``ShardingOptimizer.get_solution``); it also fills the PuLP variables and
+    ``optimizer.selected_keys`` so the assignment can be scored/inspected exactly
+    like an ILP solution.
+    """
+
+    def __init__(
+        self,
+        optimizer,
+        candidate_limit: Optional[int] = 64,
+        bp_iters: int = 20,
+        bp_damping: float = 0.2,
+        bp_tol: float = 1e-3,
+        max_sweeps: int = 12,
+        max_time_s: float = 60.0,
+        star_passes: int = 2,
+        max_star_children: int = 32,
+        group_domain_limit: int = 512,
+    ):
+        self.opt = optimizer
+        self.candidate_limit = candidate_limit
+        self.bp_iters = bp_iters
+        self.bp_damping = bp_damping
+        self.bp_tol = bp_tol
+        self.max_sweeps = max_sweeps
+        self.max_time_s = max_time_s
+        self.star_passes = star_passes
+        self.max_star_children = max_star_children
+        self.group_domain_limit = group_domain_limit
+
+        # Populated by _build_problem().
+        self.cost_bearing: list[int] = []
+        self.node_mult: dict[int, int] = {}
+        self.forbidden: set[tuple] = set()
+        self.allowed_out: dict[int, list[int]] = {}
+        self.groups: list[_Group] = []
+        self.node_to_group: dict[int, int] = {}
+        self.input_edges: dict[int, list[tuple[int, int]]] = {}
+        self._arg_prod: dict[int, dict[int, int]] = {}
+        self.consumers: dict[int, list[tuple[int, int]]] = defaultdict(list)
+        self.cur_out: dict[int, int] = {}
+        self._memory: Optional[dict[str, Any]] = None
+
+        # Populated by _build_factors().
+        self.g_unary: list[np.ndarray] = []
+        self.C: dict[tuple, np.ndarray] = {}
+        self.nbrs: list[list[int]] = []
+
+    # ------------------------------------------------------------------ #
+    # Public entry point
+    # ------------------------------------------------------------------ #
+    def get_solution(self, verbose: bool = False):
+        result, solution = self._solve(verbose=verbose)
+        self.result = result
+        return solution
+
+    def _solve(self, verbose: bool = False):
+        opt = self.opt
+        if getattr(opt, "solver_backend", "ilp") != "ilp":
+            raise RuntimeError(
+                "ApproximateShardingSolver requires an ILP-built optimizer "
+                "(decision_vars / pulp_variables / constraints)."
+            )
+        t0 = time.perf_counter()
+        self._build_problem()
+        t_bp = time.perf_counter()
+        self._build_factors()
+        t_bf = time.perf_counter()
+        t_build = t_bf - t0
+        if verbose:
+            logger.info(
+                "approx build: problem=%.2fs %s factors=%.2fs groups=%d "
+                "cost_bearing=%d edges=%d max_domain=%d",
+                t_bp - t0, getattr(self, "_build_times", {}), t_bf - t_bp,
+                len(self.groups), len(self.cost_bearing),
+                sum(len(v) for v in self.input_edges.values()),
+                max((g.domain for g in self.groups), default=0),
+            )
+
+        deadline = t0 + self.max_time_s
+        # Candidate 1: belief propagation init.
+        t_bp0 = time.perf_counter()
+        self._belief_propagation()
+        if verbose:
+            logger.info("approx phase: bp converged iter=%s delta=%.4g in %.2fs; "
+                        "bp_decode energy=%.1f",
+                        getattr(self, "_bp_last_iter", None),
+                        getattr(self, "_bp_last_delta", float("nan")),
+                        time.perf_counter() - t_bp0,
+                        self._fast_total_energy())
+        self._memory_repair()
+        self._coordinate_descent(deadline)
+        if verbose:
+            logger.info("approx phase: bp+cd energy=%.1f", self._fast_total_energy())
+        self._star_block_search(deadline)
+        bp_energy = self._fast_total_energy()
+        bp_snapshot = [g.current for g in self.groups]
+        if verbose:
+            logger.info("approx phase: bp+cd+star energy=%.1f", bp_energy)
+
+        # Candidate 2: greedy init (cheap insurance against BP doing poorly).
+        self._greedy_init()
+        self._memory_repair()
+        self._coordinate_descent(deadline)
+        self._star_block_search(deadline)
+        greedy_energy = self._fast_total_energy()
+        if verbose:
+            logger.info("approx phase: greedy+cd+star energy=%.1f", greedy_energy)
+
+        if bp_energy <= greedy_energy:
+            for gid, ci in enumerate(bp_snapshot):
+                self._set_group(gid, ci)
+        t_solve = time.perf_counter() - t0 - t_build
+
+        objective = self._write_back()
+        total_s = time.perf_counter() - t0
+        infeasible = not math.isfinite(objective)
+        status = "Infeasible" if infeasible else "Heuristic"
+        result = ApproximateSolveResult(
+            objective=objective,
+            status=status,
+            build_s=t_build,
+            solve_s=t_solve,
+            total_s=total_s,
+            num_groups=len(self.groups),
+            num_nodes=len(self.cost_bearing),
+        )
+        logger.info(
+            "ApproximateShardingSolver: status=%s objective=%.4f "
+            "(bp=%.1f greedy=%.1f) groups=%d nodes=%d "
+            "timings={build=%.3fs,solve=%.3fs,total=%.3fs}",
+            status,
+            objective,
+            bp_energy,
+            greedy_energy,
+            len(self.groups),
+            len(self.cost_bearing),
+            t_build,
+            t_solve,
+            total_s,
+        )
+        opt.profile["approximate"] = {
+            "objective": objective,
+            "status": status,
+            "build_s": t_build,
+            "solve_s": t_solve,
+            "total_s": total_s,
+            "groups": len(self.groups),
+            "bp_energy": bp_energy,
+            "greedy_energy": greedy_energy,
+        }
+        if infeasible:
+            raise RuntimeError(
+                "ApproximateShardingSolver could not find a feasible assignment. "
+                "User constraints may be contradictory or the mesh too small."
+            )
+        solution = opt._to_orig_solution(opt._extract_and_validate_solution())
+        return result, solution
+
+    # ------------------------------------------------------------------ #
+    # Problem construction
+    # ------------------------------------------------------------------ #
+    def _build_problem(self):
+        opt = self.opt
+        cluster_linked = {key[0] for key in opt.cluster_links}
+        self.cost_bearing = [
+            opt.node_map[node]
+            for node in opt.strats
+            if node.op != "output" and opt.node_map[node] not in cluster_linked
+        ]
+
+        root_to_copies: dict[int, set] = defaultdict(set)
+        for linked_key, root_key in opt.cluster_links.items():
+            root_to_copies[root_key[0]].add(linked_key[0])
+        self.node_mult = {
+            v: 1 + len(root_to_copies.get(v, ())) for v in self.cost_bearing
+        }
+
+        self.allowed_out = {}
+        for node, strat in opt.strats.items():
+            if node.op == "output":
+                continue
+            self.allowed_out[opt.node_map[node]] = list(range(len(strat.strategies)))
+
+        t = time.perf_counter()
+        paired_edges, authoritative = self._parse_constraints()
+        # Flow edges are taken from the ILP's output_input_consistent constraints
+        # (the authoritative producer per consumer-arg), NOT from _all_input_nodes:
+        # the two disagree for some ops (einsum list-args, alias/backward nodes),
+        # and trusting _all_input_nodes yields flow-infeasible assignments. The
+        # producer here is the (possibly cluster-resolved) node carrying the
+        # producer's pulp variable; the ILP guarantees its out_idx range matches
+        # the consumer's inp_idx range for that arg.
+        self._arg_prod: dict[int, dict[int, int]] = defaultdict(dict)
+        flow_couplings = []  # producer sets forced to share an out_idx
+        for (c_idx, argi), producers in authoritative.items():
+            rep = min(producers)  # all coupled -> same out, any representative
+            self._arg_prod[c_idx][argi] = rep
+            if len(producers) > 1:
+                flow_couplings.append(producers)
+        self.input_edges = {}
+        self.consumers = defaultdict(list)
+        for v in self.cost_bearing:
+            edges = sorted(self._arg_prod.get(v, {}).items())
+            self.input_edges[v] = edges
+            for argi, p in edges:
+                self.consumers[p].append((v, argi))
+        t_parse = time.perf_counter()
+
+        # Remove fully-forbidden out_idx for cost-bearing nodes.
+        for v in self.cost_bearing:
+            node = opt.nodes[v]
+            self.allowed_out[v] = [
+                o for o in self.allowed_out[v] if not self._out_fully_forbidden(v, node, o)
+            ]
+        t_forbid = time.perf_counter()
+
+        self._build_memory_info()  # also pins params when the budget is tight
+        t_mem = time.perf_counter()
+        self._build_groups(paired_edges, flow_couplings)
+        t_groups = time.perf_counter()
+        self._prune_candidates()
+        self._build_times = {
+            "parse": t_parse - t,
+            "forbid": t_forbid - t_parse,
+            "memory": t_mem - t_forbid,
+            "groups": t_groups - t_mem,
+            "prune": time.perf_counter() - t_groups,
+        }
+
+    # Constraint families that never restrict the per-node out_idx domain and
+    # are handled structurally (flow/uniqueness) or via the cost sentinel below.
+    # Skipping them by name avoids materializing items() for the ~majority of the
+    # (often >100k) constraints.
+    _SKIP_PREFIXES = (
+        "unique_decision",
+        "same_across_args",
+        "inf_cases",
+        "memory_constraint",
+    )
+
+    def _parse_constraints(self):
+        opt = self.opt
+        # inf-cost keys are forced to 0 by add_inf_cost_constraint, which also
+        # stamps dv.cost = 10000.0. Detect them directly instead of parsing the
+        # (very numerous) inf_cases constraints.
+        for key, dv in opt.decision_vars.items():
+            if dv.cost == 10000.0:
+                self.forbidden.add(key)
+
+        var_to_key = {var: key for key, var in opt.pulp_variables.items()}
+        restrict: dict[int, set] = {}
+        paired_edges: list[tuple[int, int, frozenset]] = []
+        # (consumer_idx, argi) -> set of producer_idx, from flow constraints. A
+        # clustered consumer's single inp variable is shared across all its
+        # copies, so the ILP couples one producer per copy (resolved to its root)
+        # to that inp, forcing them all equal; we collect the whole set.
+        authoritative: dict[tuple[int, int], set] = {}
+        for name, c in opt.prob.constraints.items():
+            if name.startswith("output_input_consistent"):
+                # +side = producer (grouped by out), -side = consumer (grouped by
+                # inp at a fixed arg). One +var and one -var pin down the edge.
+                pos_key = neg_key = None
+                for var, coeff in c.items():
+                    k = var_to_key.get(var)
+                    if k is None:
+                        continue
+                    if coeff > 0:
+                        pos_key = pos_key or k
+                    else:
+                        neg_key = neg_key or k
+                    if pos_key is not None and neg_key is not None:
+                        break
+                if pos_key is not None and neg_key is not None:
+                    authoritative.setdefault(
+                        (neg_key[0], neg_key[1]), set()
+                    ).add(pos_key[0])
+                continue
+            if name.startswith(self._SKIP_PREFIXES):
+                continue
+            items = list(c.items())
+            if not items:
+                continue
+            rhs = -c.constant
+            coeffs = [coeff for _, coeff in items]
+            keys = [var_to_key.get(var) for var, _ in items]
+            if any(k is None for k in keys):
+                continue
+            all_pos = all(coeff > 0 for coeff in coeffs)
+            if c.sense == pulp.LpConstraintEQ and rhs == 0 and all_pos:
+                self.forbidden.update(keys)  # Σ vars == 0  (inf / dtype / disable)
+            elif c.sense == pulp.LpConstraintEQ and rhs == 1 and all_pos:
+                nodes = {k[0] for k in keys}
+                if len(nodes) == 1:
+                    n = next(iter(nodes))
+                    out_set = {k[2] for k in keys}
+                    restrict[n] = restrict.get(n, out_set) & out_set
+            elif (
+                c.sense == pulp.LpConstraintEQ
+                and rhs == 0
+                and any(name.startswith(p) for p in _PAIRED_PREFIXES)
+                and "disable" not in name
+            ):
+                pos = {k for k, coeff in zip(keys, coeffs) if coeff > 0}
+                neg = {k for k, coeff in zip(keys, coeffs) if coeff < 0}
+                na, nb = {k[0] for k in neg}, {k[0] for k in pos}
+                oa, ob = {k[2] for k in neg}, {k[2] for k in pos}
+                if len(na) == 1 and len(nb) == 1 and len(oa) == 1 and len(ob) == 1:
+                    paired_edges.append(
+                        (next(iter(na)), next(iter(nb)),
+                         frozenset({(next(iter(oa)), next(iter(ob)))}))
+                    )
+        for n, out_set in restrict.items():
+            if n in self.allowed_out:
+                self.allowed_out[n] = [o for o in self.allowed_out[n] if o in out_set]
+        return paired_edges, authoritative
+
+    def _out_fully_forbidden(self, v, node, o):
+        strat = self.opt.strats[node].strategies[o]
+        for argi, costs in enumerate(strat.redistribute_cost):
+            if all((v, argi, o, inp) in self.forbidden for inp in range(len(costs))):
+                return True
+        return False
+
+    def _build_groups(self, paired_edges, flow_couplings):
+        opt = self.opt
+        n = len(opt.nodes)
+        uf = _UnionFind(n)
+        # cluster_links has one entry per option-key; collapse to unique
+        # (linked_node, root_node) pairs so the K-scaled loops below run over
+        # hundreds of pairs, not millions of duplicates.
+        cluster_pairs = {(lk[0], rk[0]) for lk, rk in opt.cluster_links.items()}
+        for li, ri in cluster_pairs:
+            uf.union(li, ri)
+        for a, b, _ in paired_edges:
+            uf.union(a, b)
+
+        allow: dict[tuple, dict[int, set]] = defaultdict(lambda: defaultdict(set))
+        adj: dict[int, set] = defaultdict(set)
+        for li, ri in cluster_pairs:
+            for o in self.allowed_out.get(ri, []):
+                allow[(ri, li)][o].add(o)
+            for o in self.allowed_out.get(li, []):
+                allow[(li, ri)][o].add(o)
+            adj[li].add(ri)
+            adj[ri].add(li)
+        for a, b, pairs in paired_edges:
+            for oa, ob in pairs:
+                allow[(a, b)][oa].add(ob)
+                allow[(b, a)][ob].add(oa)
+            adj[a].add(b)
+            adj[b].add(a)
+        # Flow couplings: producers feeding a clustered consumer's shared inp are
+        # forced to the same out_idx (same-index coupling, star to the rep).
+        for producers in flow_couplings:
+            ps = sorted(producers)
+            rep = ps[0]
+            for q in ps[1:]:
+                uf.union(rep, q)
+                for o in self.allowed_out.get(rep, []):
+                    allow[(rep, q)][o].add(o)
+                for o in self.allowed_out.get(q, []):
+                    allow[(q, rep)][o].add(o)
+                adj[rep].add(q)
+                adj[q].add(rep)
+
+        comps: dict[int, list[int]] = defaultdict(list)
+        for node in opt.strats:
+            if node.op == "output":
+                continue
+            v = opt.node_map[node]
+            comps[uf.find(v)].append(v)
+
+        cost_bearing_set = set(self.cost_bearing)
+        self.groups = []
+        self.node_to_group = {}
+        for members in comps.values():
+            members.sort()
+            group = _Group(members=members)
+            group.cost_bearing = [m for m in members if m in cost_bearing_set]
+            group.choices = self._enumerate_choices(members, allow, adj)
+            if not group.choices:
+                raise RuntimeError(
+                    f"No feasible joint choice for group {members}; "
+                    "constraints are contradictory."
+                )
+            gid = len(self.groups)
+            self.groups.append(group)
+            for m in members:
+                self.node_to_group[m] = gid
+
+    def _enumerate_choices(self, members, allow, adj):
+        if len(members) == 1:
+            v = members[0]
+            return [{v: o} for o in self.allowed_out.get(v, [])]
+        member_set = set(members)
+        # BFS order from a representative so every member after the first is
+        # adjacent to an already-assigned one; coupling then propagates
+        # deterministically (no spurious K-way branching that explodes the
+        # domain for large cluster+paired groups).
+        order = []
+        seen = set()
+        for start in members:
+            if start in seen:
+                continue
+            queue = [start]
+            seen.add(start)
+            while queue:
+                m = queue.pop(0)
+                order.append(m)
+                for nb in adj[m]:
+                    if nb in member_set and nb not in seen:
+                        seen.add(nb)
+                        queue.append(nb)
+        results: list[dict[int, int]] = []
+        limit = self.group_domain_limit
+
+        def candidates(m, assign):
+            cand = None
+            for nb in adj[m]:
+                if nb in assign and nb in member_set:
+                    allowed = allow[(nb, m)].get(assign[nb], set())
+                    cand = allowed if cand is None else (cand & allowed)
+            cand = set(self.allowed_out.get(m, [])) if cand is None else (
+                cand & set(self.allowed_out.get(m, [])))
+            return cand
+
+        def dfs(i, assign):
+            if len(results) >= limit:
+                return
+            if i == len(order):
+                results.append(dict(assign))
+                return
+            m = order[i]
+            for val in sorted(candidates(m, assign)):
+                assign[m] = val
+                dfs(i + 1, assign)
+                del assign[m]
+                if len(results) >= limit:
+                    return
+
+        dfs(0, {})
+        if len(results) >= limit:
+            logger.warning(
+                "Approximate solver: group of %d nodes hit group_domain_limit=%d.",
+                len(members), limit,
+            )
+        return results
+
+    def _prune_candidates(self):
+        if self.candidate_limit is None:
+            return
+        for group in self.groups:
+            if len(group.members) != 1 or len(group.choices) <= self.candidate_limit:
+                continue
+            v = group.members[0]
+            node = self.opt.nodes[v]
+            lbs = sorted(
+                (self._choice_lower_bound(v, node, c[v]), ci)
+                for ci, c in enumerate(group.choices)
+            )
+            keep = {ci for _, ci in lbs[: self.candidate_limit]}
+            group.choices = [group.choices[ci] for ci in sorted(keep)]
+
+    def _choice_lower_bound(self, v, node, o):
+        opt = self.opt
+        strat = opt.strats[node].strategies[o]
+        mult = self.node_mult[v]
+        lb = opt.decision_vars[(v, 0, o, 0)].compute_cost * len(strat.redistribute_cost)
+        lb *= mult
+        for argi, _p in self.input_edges.get(v, []):
+            best = INF
+            for inp in range(len(strat.redistribute_cost[argi])):
+                key = (v, argi, o, inp)
+                if key in self.forbidden:
+                    continue
+                dv = opt.decision_vars[key]
+                best = min(best, dv.comm_cost + dv.sharding_transition_cost)
+            if math.isfinite(best):
+                lb += mult * best
+        return lb
+
+    # ------------------------------------------------------------------ #
+    # Memory constraint (ratios, budget, tight-budget param pinning)
+    # ------------------------------------------------------------------ #
+    def _build_memory_info(self):
+        opt = self.opt
+        factors = None
+        for fname, kwargs in getattr(opt, "_constraint_log", []):
+            if fname == "add_parameter_memory_constraint":
+                factors = kwargs
+        if factors is None:
+            return
+        try:
+            from torch._functorch._aot_autograd.fx_utils import get_param_nodes
+
+            param_nodes = get_param_nodes(opt.graph)
+        except Exception:
+            return
+
+        low_f, high_f = factors["memory_factor_low"], factors["memory_factor_high"]
+        budget_low = budget_high = 0.0
+        param_idxs, ratios = [], {}
+        for node in param_nodes:
+            v = opt.node_map[node]
+            param_idxs.append(v)
+            r = {o: self._param_ratio(v, node, o) for o in self.allowed_out.get(v, [])}
+            ratios[v] = r
+            best = min(r.values())
+            budget_low += max(best, low_f)
+            budget_high += max(best, high_f)
+
+        tight = abs(budget_high - budget_low) < 1e-9
+        if tight:
+            # Σ ratio == Σ min(ratio) forces every param to a min-ratio choice.
+            for v in param_idxs:
+                r = ratios[v]
+                mn = min(r.values())
+                self.allowed_out[v] = [o for o in self.allowed_out[v]
+                                       if r[o] <= mn + 1e-12]
+        self._memory = {
+            "param_idxs": param_idxs,
+            "ratios": ratios,
+            "budget_low": budget_low,
+            "budget_high": budget_high,
+            "tight": tight,
+        }
+
+    def _param_ratio(self, v, node, o):
+        spec = self.opt.decision_vars[(v, 0, o, 0)].input_spec
+        new_shape, _ = _get_sharded_shape_stride(spec)
+        return math.prod(new_shape) / math.prod(spec.tensor_meta.shape)
+
+    # ------------------------------------------------------------------ #
+    # Factor graph (numpy unary + pairwise matrices over groups)
+    # ------------------------------------------------------------------ #
+    def _build_factors(self):
+        G = len(self.groups)
+        # per member, its out_idx across its group's choices
+        member_vals = []
+        for group in self.groups:
+            mv = {}
+            for m in group.cost_bearing:
+                mv[m] = np.array([c[m] for c in group.choices], dtype=np.int64)
+            # also predecessors that are non-cost-bearing but in this group
+            for m in group.members:
+                if m not in mv:
+                    mv[m] = np.array([c[m] for c in group.choices], dtype=np.int64)
+            member_vals.append(mv)
+
+        self.g_unary = [np.zeros(g.domain) for g in self.groups]
+        for gid, group in enumerate(self.groups):
+            for m in group.cost_bearing:
+                vals = member_vals[gid][m]
+                self.g_unary[gid] += self.node_mult[m] * self._self_cost_vec(m, vals)
+
+        C: dict[tuple, np.ndarray] = {}
+        nbr_set: list[set] = [set() for _ in range(G)]
+        for v in self.cost_bearing:
+            gv = self.node_to_group[v]
+            mult = self.node_mult[v]
+            for argi, p in self.input_edges[v]:
+                gp = self.node_to_group[p]
+                R = self._edge_matrix(v, argi, p)  # (Kv, Kp) raw, BIG if forbidden
+                av = member_vals[gv][v]
+                bp = member_vals[gp][p]
+                contrib = mult * R[np.ix_(av, bp)]  # (D_gv, D_gp)
+                if gv == gp:
+                    self.g_unary[gv] += np.diagonal(contrib)
+                else:
+                    a, b = (gv, gp) if gv < gp else (gp, gv)
+                    mat = contrib if gv < gp else contrib.T
+                    if (a, b) in C:
+                        C[(a, b)] += mat
+                    else:
+                        C[(a, b)] = mat.copy()
+                    nbr_set[a].add(b)
+                    nbr_set[b].add(a)
+        self.C = C
+        self.nbrs = [sorted(s) for s in nbr_set]
+
+    def _self_cost_vec(self, m, out_indices):
+        """Vectorized self-cost (compute + producer-less arg costs) for node m
+        over an array of out_idx."""
+        opt = self.opt
+        node = opt.nodes[m]
+        prod = self._arg_prod.get(m, {})
+        out = np.empty(len(out_indices))
+        for i, o in enumerate(out_indices):
+            strat = opt.strats[node].strategies[o]
+            n_args = len(strat.redistribute_cost)
+            dv0 = opt.decision_vars[(m, 0, o, 0)]
+            c = dv0.compute_cost * n_args
+            # Args with no flow edge (constructors / None-spec) are scored at
+            # inp=0 here; args with a producer are charged via the pairwise edges.
+            for argi in range(n_args):
+                if argi in prod:
+                    continue
+                key = (m, argi, o, 0)
+                if key in self.forbidden:
+                    c = BIG
+                    break
+                dv = opt.decision_vars[key]
+                c += dv.comm_cost + dv.sharding_transition_cost
+            out[i] = c
+        return out
+
+    def _edge_matrix(self, v, argi, p):
+        """Raw (Kv, Kp) edge cost matrix R[o_v][o_p] = comm + transition, BIG when
+        the (o_v, o_p) combination is forbidden. Only entries that can actually be
+        indexed by the group choices are filled; the rest are BIG."""
+        opt = self.opt
+        Kv = len(opt.strats[opt.nodes[v]].strategies)
+        Kp = len(opt.strats[opt.nodes[p]].strategies)
+        R = np.full((Kv, Kp), BIG)
+        gv = self.node_to_group[v]
+        gp = self.node_to_group[p]
+        ov_vals = sorted({c[v] for c in self.groups[gv].choices})
+        op_vals = sorted({c[p] for c in self.groups[gp].choices})
+        for ov in ov_vals:
+            for op in op_vals:
+                key = (v, argi, ov, op)
+                if key in self.forbidden:
+                    continue
+                dv = opt.decision_vars[key]
+                R[ov, op] = dv.comm_cost + dv.sharding_transition_cost
+        return R
+
+    def _pair_matrix(self, g, h):
+        """Pairwise cost oriented as (x_g, x_h)."""
+        if g < h:
+            return self.C[(g, h)]
+        return self.C[(h, g)].T
+
+    # ------------------------------------------------------------------ #
+    # Energy (fast, numpy)
+    # ------------------------------------------------------------------ #
+    def _fast_group_energy(self, gid, ci):
+        e = self.g_unary[gid][ci]
+        for h in self.nbrs[gid]:
+            ch = self.groups[h].current
+            e += self.C[(gid, h)][ci, ch] if gid < h else self.C[(h, gid)][ch, ci]
+        return e
+
+    def _fast_total_energy(self):
+        total = 0.0
+        for gid, g in enumerate(self.groups):
+            total += self.g_unary[gid][g.current]
+        for (a, b), mat in self.C.items():
+            total += mat[self.groups[a].current, self.groups[b].current]
+        return total
+
+    # ------------------------------------------------------------------ #
+    # Belief propagation (min-sum) + decode
+    # ------------------------------------------------------------------ #
+    def _belief_propagation(self):
+        """Sequential (forward-backward, topological) min-sum message passing.
+        Exact MAP on trees in one sweep; near-optimal on the near-tree transformer
+        graph in a few sweeps, far better than synchronous flooding."""
+        G = len(self.groups)
+        unary = self.g_unary
+        nbrs = self.nbrs
+        damp = self.bp_damping
+
+        order = sorted(range(G), key=lambda g: min(self.groups[g].members))
+        msg: dict[tuple, np.ndarray] = {}
+        for g in range(G):
+            for h in nbrs[g]:
+                msg[(g, h)] = np.zeros(len(unary[h]))
+
+        for sweep in range(self.bp_iters):
+            max_delta = 0.0
+            for direction in (order, order[::-1]):
+                for g in direction:
+                    if not nbrs[g]:
+                        continue
+                    in_sum = unary[g].copy()
+                    for k in nbrs[g]:
+                        in_sum += msg[(k, g)]
+                    for h in nbrs[g]:
+                        excl = in_sum - msg[(h, g)]
+                        P = self._pair_matrix(g, h)  # (D_g, D_h)
+                        m = (excl[:, None] + P).min(axis=0)
+                        m -= m.min()
+                        md = (1 - damp) * m + damp * msg[(g, h)]
+                        delta = np.abs(md - msg[(g, h)]).max()
+                        if delta > max_delta:
+                            max_delta = delta
+                        msg[(g, h)] = md
+            self._bp_last_iter = sweep + 1
+            self._bp_last_delta = max_delta
+            if max_delta < self.bp_tol:
+                break
+
+        self._decode(msg)
+
+    def _decode(self, msg):
+        """Sequential topological decode: fix each group to the argmin of its
+        belief conditioned on already-decoded neighbors (exact pairwise cost) and
+        BP messages for the rest. Produces a consistent, forbidden-avoiding
+        assignment, unlike independent argmin on a loopy graph."""
+        G = len(self.groups)
+        order = sorted(range(G), key=lambda g: min(self.groups[g].members))
+        decided: dict[int, int] = {}
+        for g in order:
+            b = self.g_unary[g].copy()
+            for h in self.nbrs[g]:
+                if h in decided:
+                    b = b + self._pair_matrix(g, h)[:, decided[h]]
+                else:
+                    b = b + msg[(h, g)]
+            ci = int(np.argmin(b))
+            decided[g] = ci
+            self._set_group(g, ci)
+
+    # ------------------------------------------------------------------ #
+    # Local search
+    # ------------------------------------------------------------------ #
+    def _set_group(self, gid, ci):
+        group = self.groups[gid]
+        group.current = ci
+        for m, o in group.choices[ci].items():
+            self.cur_out[m] = o
+
+    def _greedy_init(self):
+        order = sorted(range(len(self.groups)),
+                       key=lambda g: min(self.groups[g].members))
+        for gid in order:
+            self._set_group(gid, 0)
+        for gid in order:
+            best_i, best_e = 0, INF
+            for ci in range(self.groups[gid].domain):
+                e = self.g_unary[gid][ci]
+                for h in self.nbrs[gid]:
+                    if min(self.groups[h].members) < min(self.groups[gid].members):
+                        ch = self.groups[h].current
+                        e += (self.C[(gid, h)][ci, ch] if gid < h
+                              else self.C[(h, gid)][ch, ci])
+                if e < best_e:
+                    best_i, best_e = ci, e
+            self._set_group(gid, best_i)
+
+    def _coordinate_descent(self, deadline):
+        for _ in range(self.max_sweeps):
+            if time.perf_counter() > deadline:
+                break
+            improved = False
+            for gid in range(len(self.groups)):
+                if self.groups[gid].domain <= 1:
+                    continue
+                cur = self.groups[gid].current
+                best_i, best_e = cur, self._fast_group_energy(gid, cur)
+                for ci in range(self.groups[gid].domain):
+                    if ci == cur:
+                        continue
+                    e = self._fast_group_energy(gid, ci)
+                    if e < best_e - 1e-6 and self._memory_ok_after(gid, ci):
+                        best_i, best_e = ci, e
+                if best_i != cur:
+                    self._set_group(gid, best_i)
+                    improved = True
+            if not improved:
+                break
+
+    def _star_block_search(self, deadline):
+        ranked = sorted(
+            ((len(self.nbrs[g]), g) for g in range(len(self.groups))
+             if len(self.nbrs[g]) >= 2 and self.groups[g].domain > 1),
+            reverse=True,
+        )
+        for _ in range(self.star_passes):
+            if time.perf_counter() > deadline:
+                break
+            improved = False
+            for _deg, gid in ranked:
+                if time.perf_counter() > deadline:
+                    break
+                if self._optimize_star(gid):
+                    improved = True
+            if not improved:
+                break
+
+    def _optimize_star(self, gid):
+        children = [h for h in self.nbrs[gid] if self.groups[h].domain > 1]
+        child_costs = sorted(
+            ((self._fast_group_energy(h, self.groups[h].current), h) for h in children),
+            reverse=True,
+        )
+        child_ids = [h for _e, h in child_costs[: self.max_star_children]]
+        if not child_ids:
+            return False
+        block = [gid, *child_ids]
+        base = self._block_energy(block)
+        best_energy = base
+        best_center = self.groups[gid].current
+        best_children = {h: self.groups[h].current for h in child_ids}
+        for ci in range(self.groups[gid].domain):
+            self._set_group(gid, ci)
+            if not self._memory_ok_after(gid, ci):
+                continue
+            chosen = {}
+            for h in child_ids:
+                b_i, b_e = self.groups[h].current, INF
+                for hi in range(self.groups[h].domain):
+                    e = self._fast_group_energy(h, hi)
+                    if e < b_e:
+                        b_i, b_e = hi, e
+                self._set_group(h, b_i)
+                chosen[h] = b_i
+            energy = self._block_energy(block)
+            if energy < best_energy - 1e-6 and self._block_memory_ok():
+                best_energy = energy
+                best_center = ci
+                best_children = dict(chosen)
+        self._set_group(gid, best_center)
+        for h, hi in best_children.items():
+            self._set_group(h, hi)
+        return best_energy < base - 1e-6
+
+    def _block_energy(self, gids):
+        total = 0.0
+        seen_edges = set()
+        for g in gids:
+            total += self.g_unary[g][self.groups[g].current]
+            for h in self.nbrs[g]:
+                key = (g, h) if g < h else (h, g)
+                if key in seen_edges:
+                    continue
+                seen_edges.add(key)
+                a, b = key
+                total += self.C[key][self.groups[a].current, self.groups[b].current]
+        return total
+
+    # ------------------------------------------------------------------ #
+    # Memory repair
+    # ------------------------------------------------------------------ #
+    def _current_memory(self):
+        if self._memory is None:
+            return 0.0
+        return sum(self._memory["ratios"][v][self.cur_out[v]]
+                   for v in self._memory["param_idxs"])
+
+    def _memory_ok_after(self, gid, ci):
+        if self._memory is None or self._memory.get("tight"):
+            return True
+        ratios = self._memory["ratios"]
+        choice = self.groups[gid].choices[ci]
+        delta = sum(ratios[m][o] - ratios[m][self.cur_out[m]]
+                    for m, o in choice.items() if m in ratios)
+        mem = self._current_memory() + delta
+        return (self._memory["budget_low"] - 1e-6 <= mem
+                <= self._memory["budget_high"] + 1e-6)
+
+    def _block_memory_ok(self):
+        if self._memory is None or self._memory.get("tight"):
+            return True
+        mem = self._current_memory()
+        return (self._memory["budget_low"] - 1e-6 <= mem
+                <= self._memory["budget_high"] + 1e-6)
+
+    def _memory_repair(self):
+        if self._memory is None or self._memory.get("tight"):
+            return
+        low, high = self._memory["budget_low"], self._memory["budget_high"]
+        ratios = self._memory["ratios"]
+        param_groups = {self.node_to_group[v] for v in self._memory["param_idxs"]
+                        if v in self.node_to_group}
+        for _ in range(2 * max(1, len(param_groups))):
+            mem = self._current_memory()
+            if low - 1e-6 <= mem <= high + 1e-6:
+                return
+            over = mem > high
+            best = None
+            for gid in param_groups:
+                group = self.groups[gid]
+                cur_e = self._fast_group_energy(gid, group.current)
+                for ci in range(group.domain):
+                    if ci == group.current:
+                        continue
+                    choice = group.choices[ci]
+                    dmem = sum(ratios[m][choice[m]] - ratios[m][self.cur_out[m]]
+                               for m in choice if m in ratios)
+                    if (dmem < -1e-9) != over and abs(dmem) > 1e-9:
+                        continue
+                    if abs(dmem) <= 1e-9:
+                        continue
+                    score = (self._fast_group_energy(gid, ci) - cur_e) / abs(dmem)
+                    if best is None or score < best[0]:
+                        best = (score, gid, ci)
+            if best is None:
+                logger.warning("Approximate solver: memory repair stuck at %.4f "
+                               "(budget=[%.4f,%.4f]).", mem, low, high)
+                return
+            self._set_group(best[1], best[2])
+
+    # ------------------------------------------------------------------ #
+    # Write-back
+    # ------------------------------------------------------------------ #
+    def total_objective(self):
+        """Exact objective of the current assignment via decision_vars (for
+        verification); equals pulp.value(prob.objective) after write-back."""
+        total = 0.0
+        for v in self.cost_bearing:
+            node = self.opt.nodes[v]
+            o = self.cur_out[v]
+            strat = self.opt.strats[node].strategies[o]
+            prod = self._arg_prod.get(v, {})
+            n_args = len(strat.redistribute_cost)
+            c = 0.0
+            for argi in range(n_args):
+                p = prod.get(argi)
+                inp = self.cur_out[p] if p is not None else 0
+                key = (v, argi, o, inp)
+                if key in self.forbidden:
+                    return INF
+                c += self.opt.decision_vars[key].cost
+            total += self.node_mult[v] * c
+        return total
+
+    def _write_back(self):
+        opt = self.opt
+        for var in opt.pulp_variables.values():
+            var.varValue = 0
+        selected = []
+        feasible = True
+        for v in self.cost_bearing:
+            node = opt.nodes[v]
+            o = self.cur_out[v]
+            strat = opt.strats[node].strategies[o]
+            prod = self._arg_prod.get(v, {})
+            for argi in range(len(strat.redistribute_cost)):
+                p = prod.get(argi)
+                inp = self.cur_out[p] if p is not None else 0
+                key = (v, argi, o, inp)
+                if key in self.forbidden:
+                    feasible = False
+                opt.pulp_variables[key].varValue = 1
+                selected.append(key)
+        opt.selected_keys = list(selected)
+        for rk in selected:
+            opt.selected_keys.extend(opt._root_to_linked.get(rk, []))
+        opt.prob.status = pulp.LpStatusOptimal
+        opt.prob.sol_status = pulp.LpSolutionOptimal
+        # Populate prob.objective so callers can score the assignment with
+        # pulp.value(prob.objective); the returned value uses the equivalent but
+        # cheaper total_objective() rather than evaluating the full expression.
+        opt._set_objective()
+        return INF if not feasible else self.total_objective()
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 06f2a4e6..9e0bf4f5 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -1231,7 +1231,15 @@ def apply_prefetch_discount(self, scale=0.0):
     # ---- Solution ----
 
     def _set_objective(self):
-        """Add the cost minimization objective to the ILP."""
+        """Add the cost minimization objective to the ILP.
+
+        Idempotent: a no-op if the objective has already been set. This lets the
+        approximate solver populate ``prob.objective`` (so its assignment can be
+        scored with ``pulp.value(prob.objective)``) without clobbering or
+        double-adding it, and keeps repeated get_solution() calls safe.
+        """
+        if self.prob.objective is not None:
+            return
         terms = []
         for key, dv in self.decision_vars.items():
             multiplier = 1 + len(self._root_to_linked.get(key, []))
diff --git a/examples/_bench_approx.py b/examples/_bench_approx.py
new file mode 100644
index 00000000..272c47aa
--- /dev/null
+++ b/examples/_bench_approx.py
@@ -0,0 +1,166 @@
+"""Benchmark approximate solver vs ILP: objective + solve time.
+
+Setting: LLaMA3 (1b default) on a 2D (dp, tp) mesh with vocab parallelism and
+the canonical example_llama3 constraints. Both solvers run on the SAME built
+optimizer: approx first (it only fills varValues/objective via an idempotent
+_set_objective), then a fresh CBC solve for the ILP. This avoids building the
+(expensive) strategy graph twice.
+
+Env knobs: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN,
+REPEATED (1|0), RUN_ILP (1|0), ILP_TIMEOUT (seconds, 0=unlimited).
+"""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import pulp
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+_alog = logging.getLogger("autoparallel.approximate_sharding")
+_alog.setLevel(logging.INFO)
+_alog.addHandler(logging.StreamHandler())
+
+
+def log(msg):
+    print(msg, flush=True)
+
+
+_PATCHES = [
+    patch("torch.cuda.device_count", lambda: 8),
+    patch("torch.cuda.get_device_name", lambda *a, **k: "H100"),
+    patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)),
+    patch(
+        "torch.cuda.get_device_properties",
+        lambda *a, **k: type(
+            "P", (), {"major": 9, "minor": 0, "name": "H100",
+                      "total_memory": 80 * 1024**3, "multi_processor_count": 132}
+        )(),
+    ),
+]
+for p in _PATCHES:
+    p.start()
+
+MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b")
+N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
+SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4)))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
+REPEATED = os.environ.get("REPEATED", "1") == "1"
+RUN_ILP = os.environ.get("RUN_ILP", "1") == "1"
+LP_BOUND = os.environ.get("LP_BOUND", "1") == "1"
+ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "1200"))
+
+world_size = 1
+for d in MESH_SHAPE:
+    world_size *= d
+
+_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp"),
+          4: ("dp", "cp", "tp", "ep")}
+mesh_names = _NAMES[len(MESH_SHAPE)]
+fake_store = FakeStore()
+torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size)
+mesh = torch.distributed.device_mesh.init_device_mesh(
+    "cuda", MESH_SHAPE, mesh_dim_names=mesh_names
+)
+
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+seqlen = SEQLEN
+
+
+def model_fn():
+    if MODEL_TYPE == "1b":
+        args = TransformerModelArgs(
+            dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
+            ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
+    elif MODEL_TYPE == "8b":
+        args = TransformerModelArgs(
+            dim=4096, n_layers=32, n_heads=32, n_kv_heads=8,
+            ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
+    else:
+        raise ValueError(MODEL_TYPE)
+    if N_LAYERS:
+        args.n_layers = N_LAYERS
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+
+log(f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} "
+    f"world={world_size} seqlen={seqlen} repeated_subgraphs={REPEATED} "
+    f"ilp_timeout={ILP_TIMEOUT}")
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=REPEATED)
+autop.__enter__()
+ndim = mesh.ndim
+x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1)
+# vocab-parallel output only defined for 2D (matches example_llama3); otherwise
+# constrain the output like the input.
+out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x_sharding])
+autop.add_output_constraints([out_sharding])
+opt = autop.sharding_optimizer
+log(f"[build] optimizer ready in {time.perf_counter() - t:.2f}s  "
+    f"vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)} "
+    f"nodes={len(opt.nodes)}")
+
+# ---- APPROX ----
+t = time.perf_counter()
+approx = ApproximateShardingSolver(opt)
+approx.get_solution(verbose=True)
+ap_t = time.perf_counter() - t
+ap_obj = pulp.value(opt.prob.objective)
+prof = opt.profile.get("approximate", {})
+log(f"\n[APPROX] objective={ap_obj:.2f}  solve_time={ap_t:.3f}s")
+log(f"         groups={prof.get('groups')} sweeps={prof.get('sweeps')} "
+    f"build={prof.get('build_s'):.3f}s search={prof.get('solve_s'):.3f}s "
+    f"writeback={ap_t - prof.get('build_s', 0) - prof.get('solve_s', 0):.3f}s")
+
+# ---- LP relaxation lower bound (certified suboptimality upper bound) ----
+if LP_BOUND:
+    lb_res = opt.get_lower_bound(verbose=False)
+    lb = lb_res.objective
+    if lb and lb > 0:
+        cert = (ap_obj - lb) / lb
+        log(f"\n[LP-bound] lower_bound={lb:.2f}  solve={lb_res.solve_s:.2f}s  "
+            f"=> approx within {cert*100:.2f}% of optimum (certified upper bound)")
+
+# ---- ILP (fresh CBC solve on the same problem) ----
+if RUN_ILP:
+    opt._set_objective()  # idempotent: objective already populated by approx
+    kw = {"msg": True}
+    if ILP_TIMEOUT > 0:
+        kw["timeLimit"] = ILP_TIMEOUT
+    log(f"\n[ILP] solving with CBC (timeLimit={ILP_TIMEOUT or 'none'})...")
+    t = time.perf_counter()
+    opt.prob.solve(pulp.PULP_CBC_CMD(**kw))
+    ilp_t = time.perf_counter() - t
+    ilp_obj = pulp.value(opt.prob.objective)
+    status = pulp.LpStatus[opt.prob.status]
+    log(f"[ILP]    objective={ilp_obj:.2f}  solve_time={ilp_t:.3f}s  status={status}")
+
+    gap = (ap_obj - ilp_obj) / ilp_obj
+    log(f"\n=== objective gap = {gap*100:+.2f}%   solve speedup = {ilp_t/ap_t:.1f}x ===")
+    log(f"=== within 20% ? {abs(gap) <= 0.20}   (ILP status: {status}) ===")
diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py
new file mode 100644
index 00000000..0bf06688
--- /dev/null
+++ b/tests/test_approximate_sharding.py
@@ -0,0 +1,140 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import pulp
+import pytest
+import torch
+from conftest import apply_cuda_patches
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor._dtensor_spec import DTensorSpec
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+
+
+def _fake_2d_mesh():
+    return torch.distributed.device_mesh.init_device_mesh(
+        "cuda", (4, 2), mesh_dim_names=("dp", "tp")
+    )
+
+
+def _tiny_llama3_autop(mesh):
+    vocab_size = 128
+    seq_len = 16
+    batch_size = 2 * mesh.shape[0]
+    model_args = TransformerModelArgs(
+        dim=64,
+        n_layers=2,
+        n_heads=4,
+        n_kv_heads=2,
+        vocab_size=vocab_size,
+        multiple_of=32,
+        rope_theta=500000,
+        max_seq_len=seq_len,
+    )
+    with torch.device("meta"):
+        model = Transformer(model_args)
+
+    def input_fn():
+        return torch.randint(0, vocab_size, (batch_size, seq_len), device="cuda")
+
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16, reduce_dtype=torch.float32
+    )
+    return AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True)
+
+
+def _add_constraints(autop, mesh):
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([(Shard(0),) + (Replicate(),) * (mesh.ndim - 1)])
+    autop.add_output_constraints([(Shard(0), Shard(2))])
+
+
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+@pytest.mark.filterwarnings("ignore:Overwriting previously set objective")
+def test_approx_objective_close_to_ilp():
+    """The approximate solver should be much faster than the ILP while staying
+    within a small objective gap on a tiny LLaMA3 block + 2D mesh."""
+    mesh = _fake_2d_mesh()
+    with _tiny_llama3_autop(mesh) as autop:
+        _add_constraints(autop, mesh)
+        opt = autop.sharding_optimizer
+
+        autop.optimize_placement(verbose=False, solver="approx")
+        approx_objective = pulp.value(opt.prob.objective)
+        # The approx assignment must be ILP-feasible (flow consistency etc.);
+        # an infeasible assignment can score artificially low and silently pass
+        # the objective bound below.
+        violated = [n for n, c in opt.prob.constraints.items() if not c.valid()]
+        assert not violated, f"approx violated {len(violated)} constraints"
+
+        autop.optimize_placement(verbose=False, solver="ilp")
+        ilp_objective = pulp.value(opt.prob.objective)
+
+        assert math.isfinite(approx_objective)
+        assert ilp_objective > 0
+        assert approx_objective >= ilp_objective - 1e-6  # ILP is optimal
+        assert approx_objective <= ilp_objective * 1.20 + 1e-6, (
+            f"approx={approx_objective} ilp={ilp_objective} "
+            f"gap={(approx_objective / ilp_objective - 1) * 100:.1f}%"
+        )
+
+
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+def test_approx_objective_is_faithful():
+    """The solver's internal energy must equal the exact ILP objective evaluated
+    on its assignment (pulp.value), so comparisons against the ILP are valid."""
+    mesh = _fake_2d_mesh()
+    with _tiny_llama3_autop(mesh) as autop:
+        _add_constraints(autop, mesh)
+        opt = autop.sharding_optimizer
+
+        solver = ApproximateShardingSolver(opt)
+        solver.get_solution(verbose=False)
+
+        pulp_objective = pulp.value(opt.prob.objective)
+        internal_energy = solver.total_objective()
+        assert math.isfinite(internal_energy)
+        assert internal_energy == pytest.approx(pulp_objective, rel=1e-6)
+        # No forbidden decision variable should be selected.
+        assert all(key not in solver.forbidden for key in opt.selected_keys)
+        # And every ILP constraint must hold (flow consistency, paired, memory).
+        violated = [n for n, c in opt.prob.constraints.items() if not c.valid()]
+        assert not violated, f"approx violated {len(violated)} constraints"
+
+
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+def test_approx_respects_input_output_constraints():
+    """User input/output placement constraints must be honored by the solution."""
+    mesh = _fake_2d_mesh()
+    x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
+    out_sharding = (Shard(0), Shard(2))
+    with _tiny_llama3_autop(mesh) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        autop.add_input_constraints([x_sharding])
+        autop.add_output_constraints([out_sharding])
+
+        solution = autop.optimize_placement(verbose=False, solver="approx")
+        assert solution
+
+        placements = {
+            spec.placements
+            for strat in solution.values()
+            for spec in (
+                strat.output_specs
+                if isinstance(strat.output_specs, (list, tuple))
+                else (strat.output_specs,)
+            )
+            if isinstance(spec, DTensorSpec)
+        }
+        assert x_sharding in placements
+        assert out_sharding in placements

From d06957fbec8972a651f29bbc5fe8df75856e77bd Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 16:49:23 -0700
Subject: [PATCH 06/27] Speed up optimizer build by skipping PuLP for the
 approximate solver

The optimizer build (strategy enumeration, decision vars, PuLP variables and
constraints) dominates end-to-end time, especially on 3D meshes where it
constructs ~14M PuLP variables and ~6M constraints that the approximate solver
never needs. Two result-preserving changes cut build time:

- Hoist the per-node _all_input_nodes / producer-strategy lookups out of the
  inner decision-var loops (they were recomputed once per decision variable,
  ~14M times on 3D); this also speeds up the ILP build.
- Add ShardingOptimizer(build_pulp=False), selected via
  AutoParallel(solver="approx"), which skips PuLP variable and constraint
  construction entirely. The approximate solver then derives the constraint
  topology directly from the graph + cluster links + constraint log
  (_topology_direct), verified byte-identical to parsing the PuLP constraints.

On LLaMA3 1B the build drops ~2.1x (2D) and ~3.3x (3D, ~13min -> ~4min) with
byte-identical placements; 3D end-to-end goes ~17min -> ~5min.
test_lite_build_matches_full guards the equivalence.

Authored with Claude.
---
 autoparallel/api.py                  |  33 ++++-
 autoparallel/approximate_sharding.py | 185 +++++++++++++++++++++++++--
 autoparallel/optimize_sharding.py    |  60 ++++++---
 tests/test_approximate_sharding.py   |  35 ++++-
 4 files changed, 274 insertions(+), 39 deletions(-)

diff --git a/autoparallel/api.py b/autoparallel/api.py
index 907d6111..e5356d1d 100644
--- a/autoparallel/api.py
+++ b/autoparallel/api.py
@@ -203,8 +203,15 @@ def __init__(
         dynamic: bool = False,
         cost_model: Any = "nccl",
         repeated_subgraphs: bool = True,
+        solver: str = "ilp",
     ):
         self.stack = ExitStack()
+        # "approx" builds a lighter optimizer (no PuLP variables/constraints),
+        # which is much faster to construct; optimize_placement(solver="approx")
+        # then solves it heuristically. "ilp" builds the full PuLP problem.
+        if solver not in ("ilp", "approx"):
+            raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'")
+        self.solver = solver
         self.fake_mode = (
             FakeTensorMode()
         )  # TODO: maybe need to reuse the model's fake mode
@@ -281,6 +288,7 @@ def __enter__(self):
                 self.mesh,
                 force_grad_reduce_in_higher_precision,
                 repeated_subgraphs=self.repeated_subgraphs,
+                build_pulp=self.solver != "approx",
             )
 
             self.sharding_optimizer = sharding_optimizer
@@ -356,15 +364,19 @@ def add_output_constraints(self, constraints):
         self.sharding_optimizer.add_sharded_output_constraint(constraints)
         self.output_constraints = constraints
 
-    def optimize_placement(self, verbose=True, solver="ilp", approximate_options=None):
+    def optimize_placement(self, verbose=True, solver=None, approximate_options=None):
         """Solve for the optimal placement.
 
-        solver="ilp" (default) uses the exact PuLP/CBC solver. solver="approx"
-        uses the heuristic ApproximateShardingSolver, which trades a small
-        objective gap for a much faster solve. approximate_options is forwarded
-        as kwargs to the approximate solver (e.g. candidate_limit, max_sweeps).
+        solver="ilp" uses the exact PuLP/CBC solver. solver="approx" uses the
+        heuristic ApproximateShardingSolver, which trades a small objective gap
+        for a much faster solve. approximate_options is forwarded as kwargs to
+        the approximate solver (e.g. candidate_limit, max_sweeps). Defaults to the
+        solver chosen at AutoParallel construction; note an optimizer built with
+        solver="approx" has no PuLP problem and cannot run the ILP.
         """
         self._assert_entered()
+        if solver is None:
+            solver = self.solver
 
         if solver in ("approx", "approximate"):
             from .approximate_sharding import ApproximateShardingSolver
@@ -374,6 +386,12 @@ def optimize_placement(self, verbose=True, solver="ilp", approximate_options=Non
             )
             self.sharding_placement = approx.get_solution(verbose=verbose)
         elif solver == "ilp":
+            if self.sharding_optimizer.prob is None:
+                raise RuntimeError(
+                    "solver='ilp' requires a PuLP problem, but this AutoParallel "
+                    "was constructed with solver='approx' (no PuLP built). "
+                    "Construct with solver='ilp' to use the exact solver."
+                )
             self.sharding_placement = self.sharding_optimizer.get_solution(
                 verbose=False
             )
@@ -394,7 +412,10 @@ def optimize_placement(self, verbose=True, solver="ilp", approximate_options=Non
             ),
         )
 
-        if self.sharding_optimizer.prob.status == -1:
+        if (
+            self.sharding_optimizer.prob is not None
+            and self.sharding_optimizer.prob.status == -1
+        ):
             raise RuntimeError(
                 "The sharding optimizer could not find a feasible solution. "
                 "This typically means the user-specified constraints are "
diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
index aba6111c..7e1a945a 100644
--- a/autoparallel/approximate_sharding.py
+++ b/autoparallel/approximate_sharding.py
@@ -43,6 +43,9 @@
 
 import numpy as np
 import pulp
+import torch
+from torch.distributed.tensor._dtensor_spec import DTensorSpec
+from torch.distributed.tensor.placement_types import Replicate, Shard
 
 from .cost_models.compute_estimation import _get_sharded_shape_stride
 
@@ -293,7 +296,11 @@ def _build_problem(self):
             self.allowed_out[opt.node_map[node]] = list(range(len(strat.strategies)))
 
         t = time.perf_counter()
-        paired_edges, authoritative = self._parse_constraints()
+        if opt.prob is None:
+            # Lite build: no PuLP problem was constructed, derive topology directly.
+            paired_edges, authoritative = self._topology_direct()
+        else:
+            paired_edges, authoritative = self._parse_constraints()
         # Flow edges are taken from the ILP's output_input_consistent constraints
         # (the authoritative producer per consumer-arg), NOT from _all_input_nodes:
         # the two disagree for some ops (einsum list-args, alias/backward nodes),
@@ -425,6 +432,159 @@ def _parse_constraints(self):
                 self.allowed_out[n] = [o for o in self.allowed_out[n] if o in out_set]
         return paired_edges, authoritative
 
+    def _topology_direct(self):
+        """Compute the same topology (forbidden / out_idx restrictions / paired
+        edges / flow producers) that _parse_constraints extracts, but directly
+        from the graph + cluster_links + _constraint_log, WITHOUT a PuLP problem.
+        This lets the optimizer skip building millions of PuLP variables and
+        constraints when only the approximate solver is used.
+
+        Mirrors ShardingOptimizer.add_inf_cost_constraint /
+        add_grad_reduce_dtype_constraints / add_forward_backward_consistency_constraints /
+        _add_paired_output_constraint / add_node_constraint /
+        add_output_input_consistent_constraint. Verified byte-identical to
+        _parse_constraints on a full build (see tests)."""
+        from torch._functorch._aot_autograd.fx_utils import (
+            get_param_and_grad_nodes,
+            get_plain_input_and_grad_nodes,
+            get_plain_output_and_tangent_nodes,
+        )
+
+        opt = self.opt
+        cl = opt.cluster_links
+
+        def rootkey(k):
+            return cl.get(k, k)
+
+        cluster_linked = {key[0] for key in cl}
+        node_root = {}
+        for lk, rk in cl.items():
+            node_root[lk[0]] = rk[0]
+
+        def nroot(idx):
+            return node_root.get(idx, idx)
+
+        # 1. inf-cost forbidden (== add_inf_cost_constraint).
+        for key, dv in opt.decision_vars.items():
+            if not math.isfinite(dv.cost) or dv.cost == 10000.0:
+                self.forbidden.add(key)
+
+        # 2. grad-reduce-dtype forbidden (== add_grad_reduce_dtype_constraints).
+        if getattr(opt, "force_grad_reduce_in_higher_precision", False):
+            cast_op = torch.ops.autoparallel.dtype_cast.default
+            pre_cast: set[int] = set()
+            for param, grad in get_param_and_grad_nodes(opt.graph).values():
+                if grad is None:
+                    continue
+                chain = [grad]
+                n = grad
+                while len(n.all_input_nodes) == 1:
+                    parent = n.all_input_nodes[0]
+                    if len(parent.all_input_nodes) != 1:
+                        break
+                    chain.append(parent)
+                    n = parent
+                cast_idx = next(
+                    (i for i, nd in enumerate(chain) if nd.target == cast_op), None
+                )
+                if cast_idx is None:
+                    continue
+                for nd in chain[cast_idx:]:
+                    if nd in opt.node_map:
+                        pre_cast.add(opt.node_map[nd])
+            for key, dv in opt.decision_vars.items():
+                if key[0] in pre_cast and dv.comm_cost > 0:
+                    self.forbidden.add(key)
+
+        # 3. forward/backward paired output constraints + disables
+        #    (== add_forward_backward_consistency_constraints / _add_paired_output_constraint).
+        paired_edges: list[tuple[int, int, frozenset]] = []
+
+        def add_paired(node_a, node_b):
+            idx_a, idx_b = opt.node_map[node_a], opt.node_map[node_b]
+            strat_a = [str(s.output_specs) for s in opt.strats[node_a].strategies]
+            strat_b = [str(s.output_specs) for s in opt.strats[node_b].strategies]
+            num_inp_a = len(opt.strats[node_a].strategies[0].redistribute_cost[0])
+            for out_idx, sp in enumerate(strat_a):
+                if sp not in strat_b:
+                    for inp in range(num_inp_a):
+                        self.forbidden.add(rootkey((idx_a, 0, out_idx, inp)))
+                    continue
+                out_idx_b = strat_b.index(sp)
+                ra = rootkey((idx_a, 0, out_idx, 0))[0]
+                rb = rootkey((idx_b, 0, out_idx_b, 0))[0]
+                paired_edges.append((ra, rb, frozenset({(out_idx, out_idx_b)})))
+
+        for param, grad in get_param_and_grad_nodes(opt.graph).values():
+            if grad is not None:
+                add_paired(param, grad)
+        for node, gnode in get_plain_input_and_grad_nodes(opt.graph).values():
+            if gnode is not None:
+                add_paired(node, gnode)
+        for node, tnode in get_plain_output_and_tangent_nodes(opt.graph).values():
+            if tnode is not None:
+                add_paired(node, tnode)
+
+        # 4. user node/input/output placement restrictions (== add_node_constraint),
+        #    replayed from _constraint_log.
+        restrict: dict[int, set] = {}
+        for fname, kwargs in getattr(opt, "_constraint_log", []):
+            if fname != "add_node_constraint":
+                continue
+            node = next(
+                (nd for nd in opt.nodes if nd.name == kwargs["node_name"]), None
+            )
+            if node is None or node not in opt.strats:
+                continue
+            placement = kwargs["placement"]
+            if placement is None:
+                placement = (Shard(0),) + (Replicate(),) * (opt.mesh.ndim - 1)
+            out_set = set()
+            for i, s in enumerate(opt.strats[node].strategies):
+                specs = s.output_specs
+                if isinstance(specs, DTensorSpec):
+                    if specs.placements == placement:
+                        out_set.add(i)
+                elif isinstance(specs, (list, tuple)):
+                    for spec in specs:
+                        if isinstance(spec, DTensorSpec):
+                            if spec.placements == placement:
+                                out_set.add(i)
+                            break
+            r = nroot(opt.node_map[node])
+            restrict[r] = restrict.get(r, out_set) & out_set
+        for n_idx, out_set in restrict.items():
+            if n_idx in self.allowed_out:
+                self.allowed_out[n_idx] = [
+                    o for o in self.allowed_out[n_idx] if o in out_set
+                ]
+
+        # 5. flow producers (== add_output_input_consistent_constraint): for each
+        #    consumer-arg, the set of (cluster-resolved) producers feeding it.
+        authoritative: dict[tuple[int, int], set] = {}
+        for node in opt.graph.nodes:
+            if node.op == "output" or node not in opt.node_map:
+                continue
+            p_idx = opt.node_map[node]
+            p_linked = p_idx in cluster_linked
+            p_root = nroot(p_idx)
+            for user in node.users:
+                if user.op == "output" or user not in opt.node_map:
+                    continue
+                u_idx = opt.node_map[user]
+                if p_linked and u_idx in cluster_linked:
+                    continue
+                ain = opt._all_input_nodes(user)
+                argi = next((i for i, x in enumerate(ain) if x is node), None)
+                if argi is None:
+                    continue
+                ispecs = opt.strats[user].strategies[0].input_specs
+                if argi < len(ispecs) and ispecs[argi] is None:
+                    continue
+                authoritative.setdefault((nroot(u_idx), argi), set()).add(p_root)
+
+        return paired_edges, authoritative
+
     def _out_fully_forbidden(self, v, node, o):
         strat = self.opt.strats[node].strategies[o]
         for argi, costs in enumerate(strat.redistribute_cost):
@@ -1029,8 +1189,10 @@ def total_objective(self):
 
     def _write_back(self):
         opt = self.opt
-        for var in opt.pulp_variables.values():
-            var.varValue = 0
+        has_pulp = bool(opt.pulp_variables)
+        if has_pulp:
+            for var in opt.pulp_variables.values():
+                var.varValue = 0
         selected = []
         feasible = True
         for v in self.cost_bearing:
@@ -1044,15 +1206,18 @@ def _write_back(self):
                 key = (v, argi, o, inp)
                 if key in self.forbidden:
                     feasible = False
-                opt.pulp_variables[key].varValue = 1
+                if has_pulp:
+                    opt.pulp_variables[key].varValue = 1
                 selected.append(key)
         opt.selected_keys = list(selected)
         for rk in selected:
             opt.selected_keys.extend(opt._root_to_linked.get(rk, []))
-        opt.prob.status = pulp.LpStatusOptimal
-        opt.prob.sol_status = pulp.LpSolutionOptimal
-        # Populate prob.objective so callers can score the assignment with
-        # pulp.value(prob.objective); the returned value uses the equivalent but
-        # cheaper total_objective() rather than evaluating the full expression.
-        opt._set_objective()
+        # Populate prob.objective (when a PuLP problem exists) so callers can also
+        # score via pulp.value(prob.objective); the returned value uses the
+        # equivalent but cheaper total_objective(). In the lite (no-PuLP) build,
+        # there is no problem to populate.
+        if opt.prob is not None:
+            opt.prob.status = pulp.LpStatusOptimal
+            opt.prob.sol_status = pulp.LpSolutionOptimal
+            opt._set_objective()
         return INF if not feasible else self.total_objective()
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 9e0bf4f5..f66fd4c7 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -281,6 +281,7 @@ def __init__(
         force_grad_reduce_in_higher_precision=False,
         repeated_subgraphs=False,
         solver_backend="ilp",
+        build_pulp=True,
     ):
         self.orig_gm = gm
         if solver_backend not in {"ilp", "dp"}:
@@ -289,6 +290,13 @@ def __init__(
                 "expected 'ilp' or 'dp'"
             )
         self.solver_backend = solver_backend
+        # When False, skip creating PuLP variables and constraints entirely.
+        # decision_var costs + strategies + cluster_links are still built, which
+        # is all the approximate solver needs (it derives the constraint topology
+        # directly). This avoids constructing millions of PuLP objects on large /
+        # 3D meshes, where that dominates build time.
+        self.build_pulp = build_pulp
+        self.prob = None
         # The optimizer works on a concretized copy of the graph where all
         # symbolic shapes are replaced with their concrete hint values. This
         # centralizes dynamic-shape handling: the optimization pipeline
@@ -401,8 +409,9 @@ def __init__(
         )
         self.validate()
         t2 = time.perf_counter()
-        self.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize)
-        self.add_default_constraints()
+        if self.build_pulp:
+            self.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize)
+            self.add_default_constraints()
         t3 = time.perf_counter()
         decision_var_build_s = t1 - t0
         cost_estimation_s = self._decision_var_profile["cost_estimation_s"]
@@ -427,7 +436,7 @@ def __init__(
             }
         )
         n_unique_vars = len(self.pulp_variables)
-        n_constraints = len(self.prob.constraints)
+        n_constraints = len(self.prob.constraints) if self.prob is not None else 0
         self.profile["ilp"] = {
             "unique_variables": n_unique_vars,
             "logical_decision_variables": self._decision_var_profile[
@@ -819,7 +828,7 @@ def _build_decision_vars(self):
         """Build DecisionVar entries for every (node_idx, argi, out_idx, inp_idx)
         combination in the strategy space."""
         t_pulp_start = time.perf_counter()
-        self.pulp_variables = self._create_pulp_variables()
+        self.pulp_variables = self._create_pulp_variables() if self.build_pulp else {}
         t_pulp_end = time.perf_counter()
 
         # Precompute which node indices are cluster-linked so we can
@@ -845,24 +854,30 @@ def _build_decision_vars(self):
 
             num_args = len(op_strategy.strategies[0].input_specs)
 
+            # Hoisted out of the per-(out_idx, argi, inp_idx) loops: these depend
+            # only on the node, not on the strategy choice. Recomputing them per
+            # decision var was O(#vars) calls to _all_input_nodes (a tree_flatten
+            # each), which dominated build time on large/3D meshes.
+            all_input_nodes = self._all_input_nodes(node)
+            producer_strategies = [self.strats[n] for n in all_input_nodes]
+            pulp_variables = self.pulp_variables
+
             for out_idx, output_strategy in enumerate(op_strategy.strategies):
                 tc0 = time.perf_counter()
                 compute_cost = estimate_strategy_runtime_cost(node, output_strategy)
-                tc1 = time.perf_counter()
-                t_compute += tc1 - tc0
+                t_compute += time.perf_counter() - tc0
                 per_arg_compute = compute_cost / num_args
 
+                te0 = time.perf_counter()
                 for argi, redist_costs in enumerate(output_strategy.redistribute_cost):
+                    producer_strategy = (
+                        producer_strategies[argi]
+                        if argi < len(producer_strategies)
+                        else None
+                    )
+                    input_spec = output_strategy.input_specs[argi]
                     for inp_idx, default_comm_cost in enumerate(redist_costs):
                         key = (node_idx, argi, out_idx, inp_idx)
-
-                        all_input_nodes = self._all_input_nodes(node)
-                        producer_strategy = (
-                            self.strats[all_input_nodes[argi]]
-                            if all_input_nodes
-                            else None
-                        )
-                        te0 = time.perf_counter()
                         comm_cost, transition_cost = self._compute_edge_costs(
                             node,
                             output_strategy,
@@ -871,22 +886,19 @@ def _build_decision_vars(self):
                             default_comm_cost,
                             producer_strategy,
                         )
-                        te1 = time.perf_counter()
-                        t_edge += te1 - te0
-
                         redist_costs[inp_idx] = comm_cost
-
                         decision_vars[key] = DecisionVar(
-                            var=self.pulp_variables[key],
+                            var=pulp_variables[key] if pulp_variables else None,
                             cost=comm_cost + per_arg_compute + transition_cost,
                             compute_cost=per_arg_compute,
                             comm_cost=comm_cost,
                             sharding_transition_cost=transition_cost,
                             strategy=output_strategy,
                             output_spec=output_strategy.output_specs,
-                            input_spec=output_strategy.input_specs[argi],
+                            input_spec=input_spec,
                         )
                         n_vars += 1
+                t_edge += time.perf_counter() - te0
 
         # Batch-copy redistribute_cost from root strats to linked strats.
         # The root pass above updated redistribute_cost in place with
@@ -951,7 +963,7 @@ def _resolve_decision_var(self, key):
         node_idx, argi, out_idx, _ = key
         strategy = self.strats[self.nodes[node_idx]].strategies[out_idx]
         return DecisionVar(
-            var=self._get_pulp_variable(key),
+            var=self._get_pulp_variable(key) if self.pulp_variables else None,
             cost=root_dv.cost,
             compute_cost=root_dv.compute_cost,
             comm_cost=root_dv.comm_cost,
@@ -1644,6 +1656,8 @@ def _compute_solution_cost(self, solution):
     # ---- Logging ----
 
     def get_violated_constraints_log(self):
+        if self.prob is None:
+            return "Violated constraints: [] (no PuLP problem; lite build)"
         violated_constraints = [
             (k, c) for k, c in self.prob.constraints.items() if not c.valid()
         ]
@@ -2097,6 +2111,8 @@ def add_parameter_memory_constraint(
                 },
             )
         )
+        if self.prob is None:
+            return  # approx solver reads the factors from _constraint_log
         param_nodes: list[torch.fx.Node] = get_param_nodes(self.graph)
         elms: list[pulp.LpAffineExpression] = []
         budget_low: float = 0.0
@@ -2161,6 +2177,8 @@ def add_node_constraint(self, node, placement=None, constraint_name=None):
             raise RuntimeError(
                 f"Couldn't find appropriate constraint {node} {constraint_name} {placement}"
             )
+        if self.prob is None:
+            return []  # approx solver replays this from _constraint_log
         return self._add_node_constraint(
             node,
             output_constraint_indices=output_constraint_indices,
diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py
index 0bf06688..0383fad8 100644
--- a/tests/test_approximate_sharding.py
+++ b/tests/test_approximate_sharding.py
@@ -24,7 +24,7 @@ def _fake_2d_mesh():
     )
 
 
-def _tiny_llama3_autop(mesh):
+def _tiny_llama3_autop(mesh, solver="ilp"):
     vocab_size = 128
     seq_len = 16
     batch_size = 2 * mesh.shape[0]
@@ -47,7 +47,9 @@ def input_fn():
     mp_policy = MixedPrecisionPolicy(
         param_dtype=torch.bfloat16, reduce_dtype=torch.float32
     )
-    return AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True)
+    return AutoParallel(
+        model, input_fn, mesh, mp_policy, repeated_subgraphs=True, solver=solver
+    )
 
 
 def _add_constraints(autop, mesh):
@@ -138,3 +140,32 @@ def test_approx_respects_input_output_constraints():
         }
         assert x_sharding in placements
         assert out_sharding in placements
+
+
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+def test_lite_build_matches_full():
+    """Building with solver="approx" skips PuLP variables/constraints (faster
+    setup); the resulting assignment must be byte-identical to running the
+    approximate solver on a full PuLP build."""
+    mesh = _fake_2d_mesh()
+
+    with _tiny_llama3_autop(mesh, solver="ilp") as autop:
+        _add_constraints(autop, mesh)
+        assert autop.sharding_optimizer.prob is not None
+        autop.optimize_placement(verbose=False, solver="approx")
+        obj_full = autop.sharding_optimizer.profile["approximate"]["objective"]
+        keys_full = set(autop.sharding_optimizer.selected_keys)
+
+    with _tiny_llama3_autop(mesh, solver="approx") as autop:
+        _add_constraints(autop, mesh)
+        # Lite build: no PuLP problem or variables were constructed.
+        assert autop.sharding_optimizer.prob is None
+        assert not autop.sharding_optimizer.pulp_variables
+        solution = autop.optimize_placement(verbose=False)
+        obj_lite = autop.sharding_optimizer.profile["approximate"]["objective"]
+        keys_lite = set(autop.sharding_optimizer.selected_keys)
+        assert solution
+
+    assert obj_lite == pytest.approx(obj_full, rel=1e-9)
+    assert keys_lite == keys_full

From 17fdb4e117e4b1291696c2762174969eabcec02c Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 16:51:42 -0700
Subject: [PATCH 07/27] Prune invalid sharding strategies and skip CBC integer
 preprocessing

The sharding ILP's LP relaxation is naturally integral, so CBC reaches the
optimum at the root with zero branch-and-bound. The solve time was dominated
by CBC's integer preprocessing churning through hundreds of thousands of
binary columns, ~30% of which are invalid (infinite-cost) strategy edges the
optimizer materialized only to immediately constrain to zero.

To review, start with optimize_sharding.py: _build_decision_vars now computes
each edge's cost up front and only creates a variable when it is finite,
recording the survivors in _valid_keys. The constraint builders and
_create_pulp_variables tolerate the pruned keys (a missing key is an empty,
i.e. zero, term), the same-output and flow constraints key explicitly by
output index instead of relying on positional alignment, and
add_inf_cost_constraint becomes a no-op for fresh builds. _solve then passes
"preprocess off" to CBC. serialization.py seeds _valid_keys on load so saved
optimizers match freshly built ones, and test_optimize_placement.py adds a
regression test for the invariant.

On LLaMA3-1B with a 2D mesh this drops the problem from 476176 to 335390
variables and 173442 to 29643 constraints, and the solve from ~66s to ~11s,
with the objective unchanged (48449.3483).

Authored with Claude.
---
 autoparallel/optimize_sharding.py | 190 ++++++++++++++++++++++--------
 autoparallel/serialization.py     |  26 ++--
 tests/test_optimize_placement.py  |  35 ++++++
 3 files changed, 192 insertions(+), 59 deletions(-)

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 6b72878b..76d9b3c4 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -246,6 +246,10 @@ def __init__(
         # remove_constraints can keep this in sync.
         self._node_constraint_names: dict[str, str] = {}
         self._name_counters: dict[str, int] = {}
+        # Set by _build_decision_vars: the (node, arg, out, inp) keys whose
+        # strategy edge has finite cost. Invalid (infinite-cost) edges are
+        # pruned and get no variable. None means "no pruning filter".
+        self._valid_keys: set[tuple] | None = None
         t0 = time.perf_counter()
         self.strats = self.build_sharding_metadata()
         # nodes/node_map are derived from strats (not graph.nodes) so that
@@ -416,6 +420,12 @@ def _create_pulp_variables(self):
         Returns a dict mapping root (node_idx, argi, out_idx, inp_idx) keys
         to their PuLP variables. Linked keys are not stored; use
         _get_pulp_variable() to resolve them through cluster_links.
+
+        Keys whose strategy is invalid (infinite cost) are pruned: if
+        self._valid_keys is set, only those keys get a variable. These
+        variables would otherwise be forced to zero by an inf-cost
+        constraint, so skipping them shrinks the ILP without changing the
+        optimum (see _build_decision_vars).
         """
         cluster_linked_node_idxs = {key[0] for key in self.cluster_links}
 
@@ -428,6 +438,8 @@ def _create_pulp_variables(self):
                 continue
             for argi, out_idx, inp_idx in self.walk_over_options(node):
                 key = (node_idx, argi, out_idx, inp_idx)
+                if self._valid_keys is not None and key not in self._valid_keys:
+                    continue
                 root_node = self.nodes[node_idx]
                 pulp_variables[key] = pulp.LpVariable(
                     f"n={root_node},s={node_idx},arg={argi},"
@@ -439,9 +451,12 @@ def _create_pulp_variables(self):
 
     def _get_pulp_variable(self, key):
         """Look up the PuLP variable for a key, resolving through
-        cluster_links if the key belongs to a linked node."""
+        cluster_links if the key belongs to a linked node.
+
+        Returns None if the key was pruned (invalid/infinite-cost strategy).
+        """
         root_key = self.cluster_links.get(key, key)
-        return self.pulp_variables[root_key]
+        return self.pulp_variables.get(root_key)
 
     def _compute_edge_costs(
         self,
@@ -480,26 +495,33 @@ def _compute_edge_costs(
 
     def _build_decision_vars(self):
         """Build DecisionVar entries for every (node_idx, argi, out_idx, inp_idx)
-        combination in the strategy space."""
-        t_pulp_start = time.perf_counter()
-        self.pulp_variables = self._create_pulp_variables()
-        t_pulp_end = time.perf_counter()
+        combination in the strategy space.
 
+        Strategy edges whose total cost is infinite (invalid redistributions)
+        are pruned: no variable is created for them. Such a variable would be
+        forced to zero by an inf-cost constraint anyway, so dropping it leaves
+        the optimum unchanged while removing ~30% of the variables and the
+        corresponding ~80% of constraints that are pure ``var == 0`` bounds.
+        """
         # Precompute which node indices are cluster-linked so we can
         # copy costs from the root instead of recomputing them.
         self._cluster_linked_node_idxs = {key[0] for key in self.cluster_links}
 
         t_compute = 0.0
         t_edge = 0.0
-        n_vars = 0
+        n_pruned = 0
         n_cluster_copied = 0
 
+        t_pulp_start = time.perf_counter()
+        self.pulp_variables = {}
+        self._valid_keys: set[tuple] = set()
         decision_vars = {}
         strats_items = [
             (self.node_map[node], node, strat) for node, strat in self.strats.items()
         ]
 
-        # Build DVs for root nodes only (not cluster-linked).
+        # Build DVs for root nodes only (not cluster-linked). Compute the edge
+        # cost first and only materialize a variable when it is finite.
         for node_idx, node, op_strategy in strats_items:
             if node.op == "output":
                 continue
@@ -507,6 +529,7 @@ def _build_decision_vars(self):
                 continue
 
             num_args = len(op_strategy.strategies[0].input_specs)
+            all_input_nodes = self._all_input_nodes(node)
 
             for out_idx, output_strategy in enumerate(op_strategy.strategies):
                 tc0 = time.perf_counter()
@@ -516,15 +539,10 @@ def _build_decision_vars(self):
                 per_arg_compute = compute_cost / num_args
 
                 for argi, redist_costs in enumerate(output_strategy.redistribute_cost):
+                    producer_strategy = (
+                        self.strats[all_input_nodes[argi]] if all_input_nodes else None
+                    )
                     for inp_idx, default_comm_cost in enumerate(redist_costs):
-                        key = (node_idx, argi, out_idx, inp_idx)
-
-                        all_input_nodes = self._all_input_nodes(node)
-                        producer_strategy = (
-                            self.strats[all_input_nodes[argi]]
-                            if all_input_nodes
-                            else None
-                        )
                         te0 = time.perf_counter()
                         comm_cost, transition_cost = self._compute_edge_costs(
                             node,
@@ -539,9 +557,22 @@ def _build_decision_vars(self):
 
                         redist_costs[inp_idx] = comm_cost
 
+                        cost = comm_cost + per_arg_compute + transition_cost
+                        if not math.isfinite(cost):
+                            n_pruned += 1
+                            continue
+
+                        key = (node_idx, argi, out_idx, inp_idx)
+                        var = pulp.LpVariable(
+                            f"n={node},s={node_idx},arg={argi},"
+                            f"output_p={out_idx},input_p={inp_idx}",
+                            cat=pulp.LpBinary,
+                        )
+                        self.pulp_variables[key] = var
+                        self._valid_keys.add(key)
                         decision_vars[key] = DecisionVar(
-                            var=self.pulp_variables[key],
-                            cost=comm_cost + per_arg_compute + transition_cost,
+                            var=var,
+                            cost=cost,
                             compute_cost=per_arg_compute,
                             comm_cost=comm_cost,
                             sharding_transition_cost=transition_cost,
@@ -549,7 +580,6 @@ def _build_decision_vars(self):
                             output_spec=output_strategy.output_specs,
                             input_spec=output_strategy.input_specs[argi],
                         )
-                        n_vars += 1
 
         # Batch-copy redistribute_cost from root strats to linked strats.
         # The root pass above updated redistribute_cost in place with
@@ -570,16 +600,20 @@ def _build_decision_vars(self):
                     list(costs) for costs in root_spec.redistribute_cost
                 ]
         n_cluster_copied = len(self.cluster_links)
-        n_vars += n_cluster_copied
 
+        # Linked keys mirror their root's validity (redistribute_cost is copied
+        # from the root above), so only valid root keys map to linked keys.
         self._root_to_linked: dict[tuple, list[tuple]] = defaultdict(list)
         for linked_key, root_key in self.cluster_links.items():
-            self._root_to_linked[root_key].append(linked_key)
+            if root_key in self._valid_keys:
+                self._root_to_linked[root_key].append(linked_key)
 
+        t_pulp_end = time.perf_counter()
         logger.debug(
-            "_build_decision_vars breakdown (%d vars, %d cluster-copied): "
-            "pulp_vars=%.3fs, compute_cost=%.3fs, edge_cost=%.3fs",
-            n_vars,
+            "_build_decision_vars breakdown (%d vars, %d pruned-inf, %d cluster-copied): "
+            "build=%.3fs, compute_cost=%.3fs, edge_cost=%.3fs",
+            len(decision_vars),
+            n_pruned,
             n_cluster_copied,
             t_pulp_end - t_pulp_start,
             t_compute,
@@ -607,6 +641,24 @@ def _resolve_decision_var(self, key):
             input_spec=strategy.input_specs[argi],
         )
 
+    def _find_decision_var(self, node_idx, argi, out_idx):
+        """Return a DecisionVar for any surviving inp_idx of (node, arg, out),
+        or None if every edge for that output strategy was pruned.
+
+        compute_cost is identical across inp_idx for a given out_idx, so callers
+        that only need per-strategy costs can use whichever edge survived.
+        """
+        strategy = self.strats[self.nodes[node_idx]].strategies[out_idx]
+        n_inp = len(strategy.redistribute_cost[argi]) if strategy.redistribute_cost else 1
+        for inp_idx in range(n_inp):
+            key = (node_idx, argi, out_idx, inp_idx)
+            if key in self.decision_vars:
+                return self._resolve_decision_var(key)
+            root_key = self.cluster_links.get(key)
+            if root_key is not None and root_key in self.decision_vars:
+                return self._resolve_decision_var(key)
+        return None
+
     def _collect_vars(self, node, node_idx, argi, group_by, resolve_clusters=False):
         """Collect PuLP variables for a node's options, grouped by strategy index.
 
@@ -622,9 +674,11 @@ def _collect_vars(self, node, node_idx, argi, group_by, resolve_clusters=False):
             if key in self.cluster_links:
                 if not resolve_clusters:
                     continue
-                var = self.pulp_variables[self.cluster_links[key]]
+                var = self.pulp_variables.get(self.cluster_links[key])
             else:
-                var = self.pulp_variables[key]
+                var = self.pulp_variables.get(key)
+            if var is None:  # pruned (invalid/infinite-cost) strategy edge
+                continue
             group_key = out_idx if group_by == "out_idx" else inp_idx
             result.setdefault(group_key, []).append(var)
         return result
@@ -679,7 +733,9 @@ def add_unique_decision_constraint(self):
             arg_vars = {}
             for argi, out_idx, inp_idx in self.walk_over_options(node):
                 key = (node_idx, argi, out_idx, inp_idx)
-                var = self.pulp_variables[key]
+                var = self.pulp_variables.get(key)
+                if var is None:  # pruned (invalid) strategy edge
+                    continue
                 arg_vars.setdefault(argi, []).append(var)
             for eqs in arg_vars.values():
                 self.prob += (
@@ -703,20 +759,24 @@ def add_same_output_across_args_constraint(self):
                 continue
             if len(self._all_input_nodes(node)) <= 1:
                 continue
-            vars_per_output = {}
+            # Group vars by (argi, out_idx). Pruning can leave an arg with no
+            # vars for a given out_idx, so we key explicitly by out_idx rather
+            # than relying on positional alignment: a missing entry means an
+            # empty sum (== 0), which correctly forbids that output strategy.
+            num_args = len(self._all_input_nodes(node))
+            vars_per_output: dict[tuple[int, int], list] = {}
             for argi, out_idx, inp_idx in self.walk_over_options(node):
                 key = (node_idx, argi, out_idx, inp_idx)
-                var = self.pulp_variables[key]
+                var = self.pulp_variables.get(key)
+                if var is None:  # pruned (invalid) strategy edge
+                    continue
                 vars_per_output.setdefault((argi, out_idx), []).append(var)
-            eqs_per_arg = [[] for _ in self._all_input_nodes(node)]
-            for (argi, out_idx), value in vars_per_output.items():
-                eqs_per_arg[argi].append(pulp.lpSum(value))
-            arg0 = eqs_per_arg[0]
-            for arg_eqs in eqs_per_arg[1:]:
-                assert len(arg0) == len(arg_eqs)
-                for i in range(len(arg0)):
+            all_out_idxs = {oi for (_, oi) in vars_per_output}
+            for out_idx in all_out_idxs:
+                arg0_eq = pulp.lpSum(vars_per_output.get((0, out_idx), []))
+                for argi in range(1, num_args):
                     self.prob += (
-                        arg0[i] == arg_eqs[i],
+                        arg0_eq == pulp.lpSum(vars_per_output.get((argi, out_idx), [])),
                         self._get_next_name("same_across_args"),
                     )
 
@@ -790,13 +850,15 @@ def add_output_input_consistent_constraint(self):
                     )
                     continue
 
-                assert (
-                    vars_producer.keys() == vars_consumer.keys()
-                ), f"{vars_producer}, {vars_consumer}"
-
-                for k in vars_producer:
+                # Pruning can leave a producer output strategy with no matching
+                # consumer var (the consumer cannot accept that placement) or
+                # vice versa. Iterate the union and treat a missing side as an
+                # empty sum (== 0): this forbids the unmatched output strategy,
+                # exactly as the old inf-cost (== 0) variables did.
+                for k in vars_producer.keys() | vars_consumer.keys():
                     self.prob += (
-                        pulp.lpSum(vars_producer[k]) == pulp.lpSum(vars_consumer[k]),
+                        pulp.lpSum(vars_producer.get(k, []))
+                        == pulp.lpSum(vars_consumer.get(k, [])),
                         self._get_next_name("output_input_consistent"),
                     )
 
@@ -805,6 +867,11 @@ def add_inf_cost_constraint(self):
         are forced to zero.
 
         ∀i,a,o,j: c_{i,a,o,j} = ∞ ⟹ x_{i,a,o,j} = 0
+
+        Freshly built optimizers prune these edges in _build_decision_vars, so
+        no variable exists and this is a no-op. It still runs for optimizers
+        loaded from save files produced before pruning was introduced, whose
+        decision_vars may still contain infinite-cost entries.
         """
         for key, dv in self.decision_vars.items():
             if not math.isfinite(dv.cost):
@@ -886,7 +953,16 @@ def _set_objective(self):
 
     def _solve(self, verbose=False):
         self._apply_memory_constraint()
-        solver = pulp.PULP_CBC_CMD(msg=verbose)
+        # The sharding ILP has a near-totally-unimodular (flow-like) structure:
+        # CBC's LP relaxation is naturally integral, so it solves in seconds
+        # with zero branch-and-bound. CBC's integer *preprocessing* (probing,
+        # substitutions over hundreds of thousands of binary columns) is then
+        # pure overhead — it dominates the solve. Disabling it (correctness is
+        # unaffected; CBC still does full branch-and-bound if the relaxation is
+        # fractional) makes the solve ~10x faster on large graphs.
+        # Pass as a single string: PuLP prefixes each options entry with "-",
+        # so this becomes the CBC flag "-preprocess off".
+        solver = pulp.PULP_CBC_CMD(msg=verbose, options=["preprocess off"])
         # Use a dedicated temp directory for PuLP's intermediate files (.mps,
         # .sol, etc.) so they are always cleaned up, even if the process is
         # killed.  Without this, leftover files can fill up /tmp (tmpfs).
@@ -1072,8 +1148,12 @@ def _compute_solution_cost(self, solution):
 
             # Use pre-computed costs from decision vars instead of
             # estimate_strategy_runtime_cost, which needs node.meta["val"]
-            # (absent on loaded optimizers).
-            dv = self._resolve_decision_var((node_idx, 0, out_idx, 0))
+            # (absent on loaded optimizers). The (.,0,out_idx,0) edge may be
+            # pruned, so find any surviving inp_idx for arg 0 (compute_cost is
+            # identical across inp_idx for a given out_idx).
+            dv = self._find_decision_var(node_idx, 0, out_idx)
+            if dv is None:
+                continue
             num_args = max(len(strategy.input_specs), 1)
             total_compute += dv.compute_cost * num_args
 
@@ -1408,6 +1488,8 @@ def _add_node_constraint(
         for argi, out_idx, inp_idx in self.walk_over_options(node):
             if out_idx in output_constraint_indices:
                 var = self._get_pulp_variable((node_idx, argi, out_idx, inp_idx))
+                if var is None:  # pruned (invalid) strategy edge
+                    continue
                 vars_per_arg.setdefault(argi, []).append(var)
         names = []
         for eqs in vars_per_arg.values():
@@ -1435,8 +1517,10 @@ def _add_paired_output_constraint(self, node_a, node_b, constraint_name):
                 # This placement exists in node_a but not in node_b.
                 # Disable it: force sum of its decision variables to 0.
                 v_a = [
-                    self._get_pulp_variable((idx_a, 0, out_idx, inp_idx))
+                    v
                     for inp_idx in range(num_inp_a)
+                    if (v := self._get_pulp_variable((idx_a, 0, out_idx, inp_idx)))
+                    is not None
                 ]
                 self.prob += (
                     pulp.lpSum(v_a) == 0,
@@ -1445,12 +1529,16 @@ def _add_paired_output_constraint(self, node_a, node_b, constraint_name):
                 continue
             out_idx_b = strat_b.index(sp)
             v_a = [
-                self._get_pulp_variable((idx_a, 0, out_idx, inp_idx))
+                v
                 for inp_idx in range(num_inp_a)
+                if (v := self._get_pulp_variable((idx_a, 0, out_idx, inp_idx)))
+                is not None
             ]
             v_b = [
-                self._get_pulp_variable((idx_b, 0, out_idx_b, inp_idx))
+                v
                 for inp_idx in range(num_inp_b)
+                if (v := self._get_pulp_variable((idx_b, 0, out_idx_b, inp_idx)))
+                is not None
             ]
             self.prob += (
                 pulp.lpSum(v_b) == pulp.lpSum(v_a),
@@ -1653,7 +1741,9 @@ def _apply_memory_constraint(self):
             num_out_strat = len(self.strats[node].strategies)
             ratios: list[float] = []
             for out_idx in range(num_out_strat):
-                dv = self._resolve_decision_var((node_idx, 0, out_idx, 0))
+                dv = self._find_decision_var(node_idx, 0, out_idx)
+                if dv is None:  # every edge for this strategy was pruned
+                    continue
                 spec: DTensorSpec = dv.input_spec
                 assert spec.tensor_meta is not None
                 tensor_shape: torch.Size = spec.tensor_meta.shape
diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py
index 46cb3fde..af85ab56 100644
--- a/autoparallel/serialization.py
+++ b/autoparallel/serialization.py
@@ -286,6 +286,19 @@ def load_optimizer(cls, path):
     # for add_node_constraint() default placement, without needing a PG
     opt.mesh = _MeshPlaceholder(save_dict["mesh_shape"], save_dict["mesh_dim_names"])
 
+    # Map saved decision-var keys to loaded node indices. Only these keys had
+    # a finite-cost (valid) strategy edge at save time; invalid edges were
+    # pruned and must not get a variable, so seed _valid_keys before creating
+    # the PuLP variables (see ShardingOptimizer._build_decision_vars).
+    save_node_names = save_dict["dv_costs_node_names"]
+    keys_t = save_dict["dv_costs_keys"].tolist()
+    vals_t = save_dict["dv_costs_vals"].tolist()
+    mapped_keys = [
+        (opt.node_map[nodes_by_name[save_node_names[k[0]]]], k[1], k[2], k[3])
+        for k in keys_t
+    ]
+    opt._valid_keys = set(mapped_keys)
+
     # Rebuild PuLP variables and decision vars from saved costs.
     t2 = time.perf_counter()
     opt.pulp_variables = opt._create_pulp_variables()
@@ -296,19 +309,14 @@ def load_optimizer(cls, path):
         len(opt.pulp_variables),
     )
     # Reconstruct decision_vars from compact tensors.
-    save_node_names = save_dict["dv_costs_node_names"]
-    keys_t = save_dict["dv_costs_keys"].tolist()
-    vals_t = save_dict["dv_costs_vals"].tolist()
     opt.decision_vars = {}
-    for (save_node_idx, argi, out_idx, inp_idx), (
+    for key, (
         compute_cost,
         comm_cost,
         transition_cost,
-    ) in zip(keys_t, vals_t):
-        node_name = save_node_names[save_node_idx]
-        node = nodes_by_name[node_name]
-        node_idx = opt.node_map[node]
-        key = (node_idx, argi, out_idx, inp_idx)
+    ) in zip(mapped_keys, vals_t):
+        node_idx, argi, out_idx, inp_idx = key
+        node = opt.nodes[node_idx]
         strategy = opt.strats[node].strategies[out_idx]
         opt.decision_vars[key] = DecisionVar(
             var=opt.pulp_variables[key],
diff --git a/tests/test_optimize_placement.py b/tests/test_optimize_placement.py
index 59a4cf7c..9325f1f5 100644
--- a/tests/test_optimize_placement.py
+++ b/tests/test_optimize_placement.py
@@ -841,3 +841,38 @@ def input_fn():
         # With memory budget enforced and no node constraint, the optimizer
         # should shard this param again
         assert solution[orig_node].output_specs.placements == (Shard(0),)
+
+
+@apply_cuda_patches
+def test_invalid_strategies_are_pruned(device_mesh_2d):
+    """Infinite-cost (invalid) strategy edges must not be materialized as
+    variables or constraints, and pruning them must not change the optimum."""
+    import math
+
+    mesh = device_mesh_2d
+    model_fn, input_fn = _make_model_and_input_fn(mesh, "transformer_block")
+    with torch.device("meta"):
+        model = model_fn()
+
+    with AutoParallel(model, input_fn, mesh) as autop:
+        autop.add_input_constraints([(Shard(0), Replicate())])
+        autop.add_output_constraints([(Shard(0), Replicate())])
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        opt = autop.sharding_optimizer
+
+        # Invariant: every materialized decision var is finite-cost, and the
+        # PuLP variable set is exactly the set of valid (finite) keys.
+        assert all(math.isfinite(dv.cost) for dv in opt.decision_vars.values())
+        assert set(opt.pulp_variables) == opt._valid_keys
+        assert all(k in opt._valid_keys for k in opt.decision_vars)
+
+        # No inf-cost (== 0) constraints should be emitted any more.
+        assert not any(
+            name.startswith("inf_cases") for name in opt.prob.constraints
+        )
+
+        # The pruned problem must still solve to a valid solution.
+        solution = autop.optimize_placement()
+        param_nodes = get_param_nodes(autop.gm.graph)
+        for node in param_nodes:
+            assert node in solution

From 238443e3db2ab487bf6cdb9b104b446a0ac82f72 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 16:53:38 -0700
Subject: [PATCH 08/27] Add sharding annotations with Shardy-like propagation
 to shrink the ILP search space

Users can express a tensor-parallel plan as a few annotations and have it propagated through the graph, turning the unambiguous part into ILP constraints while leaving the genuine cost tradeoffs (FSDP/data axis, residual sequence-parallelism, collective placement) to the solver.

Review in this order: propagation.py introduces the propagation engine (per-mesh-axis, reshard-free, worklist fixpoint with priority rounds, pinning only Shard placements so the optimum stays reachable); optimize_sharding.py adds the primitives it emits -- per-axis node constraints (add_node_axis_constraint, with method="fix" that prunes decision variables instead of adding equality rows), memory-budget awareness of per-axis-pinned params, and solve_lp_relaxation for diagnosing/short-circuiting the solve; api.py exposes the user-facing annotate_* and propagate_annotations entry points. Then tests, example, and docs.

On LLaMA3-1B (2D mesh) the annotated path reaches the same objective as the full ILP on a ~36% smaller search space and solves faster. The LP relaxation is integral on this problem, so solve_lp_relaxation(extract=True) gives an even larger, exact speedup.

Authored with Claude.
---
 autoparallel/api.py                  | 162 +++++++++
 autoparallel/optimize_sharding.py    | 172 +++++++++-
 autoparallel/propagation.py          | 487 +++++++++++++++++++++++++++
 docs/README.md                       |   1 +
 docs/sharding_annotations.md         | 183 ++++++++++
 examples/example_llama3_annotated.py | 145 ++++++++
 tests/test_propagation.py            | 222 ++++++++++++
 7 files changed, 1371 insertions(+), 1 deletion(-)
 create mode 100644 autoparallel/propagation.py
 create mode 100644 docs/sharding_annotations.md
 create mode 100644 examples/example_llama3_annotated.py
 create mode 100644 tests/test_propagation.py

diff --git a/autoparallel/api.py b/autoparallel/api.py
index 1670d509..4fcb9ef5 100644
--- a/autoparallel/api.py
+++ b/autoparallel/api.py
@@ -44,6 +44,7 @@
 )
 from .module_construction import make_parallel_module
 from .optimize_sharding import ShardingOptimizer
+from .propagation import ShardingAnnotation, ShardingPropagator
 from .shardings.placement_options import _get_device_from_mesh
 from .tracing import (
     _add_unused_params_and_buffers,
@@ -287,6 +288,8 @@ def __enter__(self):
 
             self.input_constraints = None
             self.output_constraints = None
+            self._annotations: list[tuple[Any, ShardingAnnotation]] = []
+            self.propagation_result = None
 
             self.active = True
 
@@ -356,6 +359,165 @@ def add_output_constraints(self, constraints):
         self.sharding_optimizer.add_sharded_output_constraint(constraints)
         self.output_constraints = constraints
 
+    # ---- Sharding annotations (Shardy-like propagation) ----
+
+    def _normalize_placements(self, placements):
+        """Pad/validate a placement tuple to mesh.ndim, leaving missing trailing
+        axes open (``None``)."""
+        placements = tuple(placements)
+        if len(placements) > self.mesh.ndim:
+            raise ValueError(
+                f"annotation has {len(placements)} placements but mesh has "
+                f"{self.mesh.ndim} dims"
+            )
+        return placements + (None,) * (self.mesh.ndim - len(placements))
+
+    def _param_fqn_to_node(self):
+        from torch._functorch._aot_autograd.fx_utils import get_param_and_grad_nodes
+
+        graph = self.sharding_optimizer.graph
+        return {
+            desc.target: node
+            for desc, (node, _grad) in get_param_and_grad_nodes(graph).items()
+        }
+
+    def annotate_parameter(self, fqn, placements, priority=1):
+        """Annotate the sharding of one or more parameters.
+
+        ``fqn`` is a parameter fully-qualified name, optionally a glob pattern
+        (e.g. ``"layers.*.attention.wq.weight"``) to annotate the matching
+        parameter in every layer at once.  ``placements`` is a tuple of
+        :class:`Placement` (or ``None`` to leave a mesh axis open — typical for
+        the data/FSDP axis of a weight).  Weights default to a lower priority
+        than activations so the data-parallel axis wins shared-axis conflicts.
+        """
+        import fnmatch
+
+        placements = self._normalize_placements(placements)
+        fqn_map = self._param_fqn_to_node()
+        matched = [node for name, node in fqn_map.items() if fnmatch.fnmatch(name, fqn)]
+        if not matched:
+            raise ValueError(
+                f"No parameter matches {fqn!r}. Available parameters: "
+                f"{sorted(fqn_map)}"
+            )
+        for node in matched:
+            self._annotations.append((node, ShardingAnnotation(placements, priority)))
+        return matched
+
+    def annotate_input(self, idx, placements, priority=0):
+        """Annotate the sharding of graph input ``idx``."""
+        from torch._functorch._aot_autograd.fx_utils import (
+            get_plain_input_and_grad_nodes,
+        )
+
+        placements = self._normalize_placements(placements)
+        graph = self.sharding_optimizer.graph
+        nodes = {
+            desc.idx: node
+            for desc, (node, _grad) in get_plain_input_and_grad_nodes(graph).items()
+        }
+        if idx not in nodes:
+            raise ValueError(f"No graph input with index {idx}; have {sorted(nodes)}")
+        self._annotations.append((nodes[idx], ShardingAnnotation(placements, priority)))
+        return nodes[idx]
+
+    def annotate_output(self, idx, placements, priority=0):
+        """Annotate the sharding of graph output ``idx``."""
+        from torch._functorch._aot_autograd.fx_utils import (
+            get_plain_output_and_tangent_nodes,
+        )
+
+        placements = self._normalize_placements(placements)
+        graph = self.sharding_optimizer.graph
+        nodes = {
+            desc.idx: node
+            for desc, (node, _t) in get_plain_output_and_tangent_nodes(graph).items()
+        }
+        if idx not in nodes:
+            raise ValueError(f"No graph output with index {idx}; have {sorted(nodes)}")
+        self._annotations.append((nodes[idx], ShardingAnnotation(placements, priority)))
+        return nodes[idx]
+
+    def annotate_node(self, node, placements, priority=0):
+        """Annotate the sharding of an arbitrary graph node."""
+        placements = self._normalize_placements(placements)
+        self._annotations.append((node, ShardingAnnotation(placements, priority)))
+        return node
+
+    def _mirror_annotations_to_backward(self):
+        """Build extra propagation seeds on the backward twins of annotated
+        forward tensors.
+
+        A gradient shares the sharding of the value it is the gradient of, so a
+        forward annotation also pins its twin (parameter->grad, input->grad,
+        output->tangent).  Seeding the twins lets the TP plan propagate through
+        the backward pass too.  These seeds are only used for propagation: the
+        twins themselves stay unconstrained (handled by the forward/backward
+        consistency constraints), but their neighbors get determined.
+        """
+        from torch._functorch._aot_autograd.fx_utils import (
+            get_param_and_grad_nodes,
+            get_plain_input_and_grad_nodes,
+            get_plain_output_and_tangent_nodes,
+        )
+
+        graph = self.sharding_optimizer.graph
+        twin = {}
+        for _d, (node, grad) in get_param_and_grad_nodes(graph).items():
+            if grad is not None:
+                twin[node] = grad
+        for _d, (node, grad) in get_plain_input_and_grad_nodes(graph).items():
+            if grad is not None:
+                twin[node] = grad
+        for _d, (node, tangent) in get_plain_output_and_tangent_nodes(graph).items():
+            if tangent is not None:
+                twin[node] = tangent
+
+        mirrored = []
+        for node, ann in self._annotations:
+            if node in twin:
+                mirrored.append((twin[node], ann))
+        return mirrored
+
+    def propagate_annotations(self, verbose=True, aggressive=False, method="fix"):
+        """Propagate the registered annotations Shardy-style and turn the
+        unambiguously-determined nodes into ILP constraints, shrinking the
+        search space.  Returns a :class:`PropagationResult`.
+
+        Call this after the ``annotate_*`` / ``add_*_constraint`` calls and
+        before :meth:`optimize_placement`.
+
+        With ``aggressive=False`` (the default) only genuine ``Shard`` axes are
+        pinned, which keeps the full-ILP optimum reachable.  ``aggressive=True``
+        also pins ``Replicate`` / ``Partial`` axes for a larger reduction at the
+        cost of possibly forbidding cheaper reshard placements (e.g. sequence
+        parallelism), so the objective may move slightly off the optimum.
+
+        ``method`` is how each pin is enforced: ``"fix"`` (default) removes the
+        ruled-out decision variables (shrinks the problem; scales best on large
+        meshes), ``"constraint"`` adds removable ``== 1`` rows instead.
+        """
+        self._assert_entered()
+        propagator = ShardingPropagator(self.sharding_optimizer)
+        seeds = self._annotations + self._mirror_annotations_to_backward()
+        propagator.run(seeds)
+        self.propagation_result = propagator.apply_to_optimizer(
+            aggressive=aggressive, method=method
+        )
+        if verbose:
+            logger.info(
+                "Annotation propagation reduced the output-strategy search "
+                "space by %.1f%% (%d -> %d) via %d per-axis constraints on %d "
+                "nodes",
+                100.0 * self.propagation_result.reduction,
+                self.propagation_result.strategies_before,
+                self.propagation_result.strategies_after,
+                self.propagation_result.axis_constraints,
+                self.propagation_result.nodes_determined,
+            )
+        return self.propagation_result
+
     def optimize_placement(self, verbose=True):
         self._assert_entered()
 
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 6b72878b..1ac30431 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -245,6 +245,17 @@ def __init__(
         # so that _apply_memory_constraint can exclude constrained params and
         # remove_constraints can keep this in sync.
         self._node_constraint_names: dict[str, str] = {}
+        # Maps node_name → list of (mesh_dim, placement) per-axis constraints.
+        # A per-axis constraint keeps a param in the memory budget (unlike a full
+        # node constraint) but restricts which strategies it can use, so the
+        # budget must compute its best achievable memory ratio over only the
+        # strategies that satisfy these constraints.
+        self._node_axis_constraints: dict[
+            str, list[tuple[int, Placement]]
+        ] = defaultdict(list)
+        # Variables pinned to 0 by axis constraints applied with method="fix".
+        # Stored so they can be restored by remove_constraints / for re-solving.
+        self._fixed_vars: list = []
         self._name_counters: dict[str, int] = {}
         t0 = time.perf_counter()
         self.strats = self.build_sharding_metadata()
@@ -911,6 +922,73 @@ def _solve(self, verbose=False):
                 "using a larger mesh."
             )
 
+    def solve_lp_relaxation(self, verbose=False, frac_tol=1e-6, extract=False):
+        """Solve the continuous relaxation of the ILP (binary variables relaxed
+        to [0, 1]) and report diagnostics, restoring the binary categories on
+        exit so a later ILP solve is unaffected.
+
+        Returns a dict with the relaxation objective (a lower bound on the ILP
+        optimum), the solve time, the number/fraction of decision variables that
+        came out fractional, and the solver status.  This is the lens for
+        understanding why constraints (e.g. propagated annotations) speed up the
+        ILP: a relaxation that is tighter (objective closer to the ILP optimum)
+        and less fractional leaves branch-and-bound far less work.
+
+        For this sharding problem the relaxation is empirically integral, so the
+        relaxation optimum equals the ILP optimum.  With ``extract=True`` and an
+        integral solution, the dict also contains a ``"solution"`` key with the
+        per-node strategy dict (same form as :meth:`get_solution`) — i.e. the LP
+        relaxation can be used as a much cheaper exact solve, skipping
+        branch-and-bound.  ``"solution"`` is ``None`` when the relaxation came
+        out fractional.
+
+        Requires the objective to have been set (e.g. via a prior get_solution,
+        or _set_objective).
+        """
+        variables = self.prob.variables()
+        original_cats = [v.cat for v in variables]
+        self._apply_memory_constraint()
+        t0 = time.perf_counter()
+        try:
+            for v in variables:
+                v.cat = pulp.LpContinuous  # bounds are already [0, 1] for binaries
+            solver = pulp.PULP_CBC_CMD(msg=verbose)
+            with tempfile.TemporaryDirectory() as tmpdir:
+                solver.tmpDir = tmpdir
+                self.prob.solve(solver)
+            solve_time = time.perf_counter() - t0
+            objective = pulp.value(self.prob.objective)
+            n_fractional = 0
+            n_vars = 0
+            for v in variables:
+                val = v.value()
+                if val is None:
+                    continue
+                n_vars += 1
+                if min(val, 1.0 - val) > frac_tol:
+                    n_fractional += 1
+            solution = None
+            if extract and n_fractional == 0:
+                self.selected_keys = [
+                    key
+                    for key, dv in self.decision_vars.items()
+                    if dv.var.value() is not None and dv.var.value() > 0.5
+                ]
+                for root_key in list(self.selected_keys):
+                    self.selected_keys.extend(self._root_to_linked.get(root_key, []))
+                solution = self._to_orig_solution(self._extract_and_validate_solution())
+        finally:
+            for v, cat in zip(variables, original_cats):
+                v.cat = cat
+        return {
+            "objective": objective,
+            "solve_time": solve_time,
+            "n_fractional": n_fractional,
+            "n_vars": n_vars,
+            "status": pulp.LpStatus[self.prob.status],
+            "solution": solution,
+        }
+
     def _extract_and_validate_solution(self):
         """Validate the ILP solution and return the optimal strategy per node."""
         selected_by_node = {}
@@ -1651,7 +1729,14 @@ def _apply_memory_constraint(self):
                 continue
             node_idx = self.node_map[node]
             num_out_strat = len(self.strats[node].strategies)
+            # Per-axis constraints restrict which strategies this param may use,
+            # which raises its best achievable memory ratio (e.g. a param pinned
+            # to Replicate on the tensor axis can no longer be sharded there).
+            # The budget must reflect that, or it would under-allocate and make
+            # the problem spuriously infeasible.
+            axis_constraints = self._node_axis_constraints.get(node.name, [])
             ratios: list[float] = []
+            allowed_ratios: list[float] = []
             for out_idx in range(num_out_strat):
                 dv = self._resolve_decision_var((node_idx, 0, out_idx, 0))
                 spec: DTensorSpec = dv.input_spec
@@ -1663,7 +1748,12 @@ def _apply_memory_constraint(self):
                 ratio = new_size / old_size
                 ratios.append(ratio)
                 elms.append(dv.var * ratio)
-            best_ratio: float = min(ratios)
+                out_spec = self.strats[node].strategies[out_idx].output_specs
+                if isinstance(out_spec, DTensorSpec) and all(
+                    out_spec.placements[m] == p for m, p in axis_constraints
+                ):
+                    allowed_ratios.append(ratio)
+            best_ratio: float = min(allowed_ratios) if allowed_ratios else min(ratios)
             budget_low += max(best_ratio, memory_factor_low)
             budget_high += max(best_ratio, memory_factor_high)
 
@@ -1717,6 +1807,86 @@ def add_node_constraint(self, node, placement=None, constraint_name=None):
             self._node_constraint_names[name] = node.name
         return names
 
+    def add_node_axis_constraint(
+        self, node, mesh_dim, placement, constraint_name=None, method="constraint"
+    ):
+        """Force a node's output placement on a single mesh axis, leaving the
+        other axes free for the ILP.
+
+        This is the per-mesh-axis analogue of :meth:`add_node_constraint` and is
+        what sharding propagation emits: it can pin the tensor-parallel axis of a
+        weight while leaving the data axis open for FSDP.  Unlike
+        :meth:`add_node_constraint` it does *not* register the node in
+        ``_node_constraint_names``, so a partially-constrained parameter is still
+        counted by the memory budget and can be sharded on its free axes.
+
+        ``method`` controls how the pin is enforced:
+
+        * ``"constraint"`` adds an ``== 1`` equality over the matching decision
+          variables (removable by name via :meth:`remove_constraints`).
+        * ``"fix"`` instead sets the upper bound of the *non-matching* decision
+          variables to 0.  This shrinks the problem (the solver's presolve drops
+          fixed columns) rather than adding a row, which scales much better on
+          large meshes where adding thousands of equality rows otherwise slows
+          the solve.  It is not removable by constraint name.
+
+        For nodes with tuple output_specs the placement is matched against the
+        first DTensorSpec element, matching :meth:`add_node_constraint`.
+        """
+        node = self._normalize_node(node)
+        if constraint_name is None:
+            constraint_name = "axis_constraint"
+        self._constraint_log.append(
+            (
+                "add_node_axis_constraint",
+                {
+                    "node_name": node.name,
+                    "mesh_dim": mesh_dim,
+                    "placement": placement,
+                    "constraint_name": constraint_name,
+                    "method": method,
+                },
+            )
+        )
+        assert node in self.strats, (node, self.strats.keys())
+        strat = self.strats[node]
+        output_constraint_indices = []
+        for i, s in enumerate(strat.strategies):
+            specs = s.output_specs
+            spec = None
+            if isinstance(specs, DTensorSpec):
+                spec = specs
+            elif isinstance(specs, (list, tuple)):
+                spec = next((x for x in specs if isinstance(x, DTensorSpec)), None)
+            if spec is not None and spec.placements[mesh_dim] == placement:
+                output_constraint_indices.append(i)
+        if len(output_constraint_indices) == 0:
+            raise RuntimeError(
+                f"Couldn't find a strategy for {node} with {placement} on mesh "
+                f"dim {mesh_dim} (constraint {constraint_name})"
+            )
+        self._node_axis_constraints[node.name].append((mesh_dim, placement))
+        if method == "fix":
+            self._fix_node_output_indices(node, set(output_constraint_indices))
+            return []
+        return self._add_node_constraint(
+            node,
+            output_constraint_indices=output_constraint_indices,
+            constraint_name=constraint_name,
+        )
+
+    def _fix_node_output_indices(self, node, keep_out_idxs):
+        """Pin a node's output strategy by fixing every decision variable whose
+        out_idx is not in ``keep_out_idxs`` to 0 (upper bound)."""
+        node_idx = self.node_map[node]
+        for argi, out_idx, inp_idx in self.walk_over_options(node):
+            if out_idx in keep_out_idxs:
+                continue
+            var = self._get_pulp_variable((node_idx, argi, out_idx, inp_idx))
+            if var.upBound != 0:
+                var.upBound = 0
+                self._fixed_vars.append(var)
+
     def _add_io_placement_constraints(
         self,
         nodes_dict,
diff --git a/autoparallel/propagation.py b/autoparallel/propagation.py
new file mode 100644
index 00000000..ae1e5366
--- /dev/null
+++ b/autoparallel/propagation.py
@@ -0,0 +1,487 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Shardy-like sharding propagation to seed and shrink the ILP search space.
+
+The ILP in :mod:`optimize_sharding` enumerates, for every node, every valid
+combination of input/output placements and lets the solver pick the global
+optimum.  For large models this search space is enormous even though, in
+practice, a handful of user decisions ("these weights are tensor-parallel",
+"the batch is data-parallel") already pin down the strategy for the vast
+majority of the graph.
+
+This module lets the user attach a small number of *sharding annotations* and
+then propagates them through the graph the way `Shardy
+<https://github.com/openxla/shardy>`_ does: it pushes each known sharding along
+edges that require no resharding, narrowing every node's set of candidate
+strategies until the unambiguous nodes are fully determined.  Determined nodes
+are turned into ILP constraints, which collapses the search space and the solve
+time while leaving the genuinely ambiguous decisions (and where to place the
+necessary collectives) to the ILP.
+
+Key design points that mirror Shardy:
+
+* **Per-mesh-axis propagation.**  A placement is propagated one mesh axis at a
+  time.  This is what lets, e.g., the tensor-parallel sharding of a weight flow
+  through a matmul on the ``tp`` axis while the ``dp`` axis is independently
+  resolved (data-parallel batch, with FSDP all-gathers left to the ILP).  It is
+  the analogue of Shardy projecting tensor shardings onto per-factor axes.
+* **Conservative, reshard-free propagation.**  Along an edge we only narrow a
+  consumer to the placements it can take *without* a reshard from the producer
+  (zero ``redistribute_cost``).  At a genuine reshard boundary (a necessary
+  collective, e.g. an all-reduce or all-gather) no zero-cost option exists, so
+  propagation stops there and the ILP decides the collective.  This never
+  empties a domain.
+* **Priority rounds.**  Annotations carry a priority (lower = applied first,
+  matching Shardy).  Data/activation annotations propagate before weight
+  annotations so that, where they compete for the same mesh axis (the ``dp``
+  axis of a matmul), the data-parallel sharding wins and the weight is
+  all-gathered rather than the activation being resharded.
+"""
+
+import logging
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from typing import Optional
+
+from torch.distributed.tensor._dtensor_spec import DTensorSpec
+from torch.distributed.tensor.placement_types import Placement
+
+logger = logging.getLogger(__name__)
+
+# A per-axis placement value; ``None`` means "open" (unconstrained on that axis).
+AxisPlacement = Optional[Placement]
+
+
+@dataclass(frozen=True)
+class ShardingAnnotation:
+    """A user-provided sharding hint for one tensor (graph node).
+
+    Args:
+        placements: one entry per mesh dimension.  Each entry is a
+            :class:`Placement` (e.g. ``Shard(0)``, ``Replicate()``) or ``None``
+            to leave that mesh axis open for propagation / the ILP to decide.
+            Leaving an axis open is the common case for weights: the user pins
+            the tensor-parallel axis and lets FSDP on the data axis be chosen by
+            the optimizer.
+        priority: lower numbers are propagated first.  Activation/IO hints
+            should have a smaller priority than weight hints so the
+            data-parallel axis wins shared-axis conflicts.
+    """
+
+    placements: tuple[AxisPlacement, ...]
+    priority: int = 0
+
+
+# Micro-strategy: a single strategy projected onto one mesh axis.
+# ``in_reqs`` is the per-axis input placement required for each tensor argument
+# (``None`` for non-tensor / undefined args); ``out`` is the per-axis output
+# placement produced.
+@dataclass(frozen=True)
+class _Micro:
+    in_reqs: tuple[AxisPlacement, ...]
+    out: AxisPlacement
+
+
+@dataclass
+class PropagationResult:
+    """Summary of a propagation run, for logging and tests."""
+
+    determined: dict = field(default_factory=dict)  # node -> [(mesh_dim, placement)]
+    strategies_before: int = 0
+    strategies_after: int = 0
+    nodes_touched: int = 0
+    nodes_determined: int = 0
+    axis_constraints: int = 0
+
+    @property
+    def reduction(self) -> float:
+        if self.strategies_before == 0:
+            return 0.0
+        return 1.0 - self.strategies_after / self.strategies_before
+
+
+class ShardingPropagator:
+    """Propagates sharding annotations over an optimizer's strategy graph.
+
+    The propagator works on the optimizer's concrete graph and reuses its
+    per-node ``OpStrategy`` list (``optimizer.strats``) as the per-op sharding
+    rules.  It maintains, for every single-output node and every mesh axis, the
+    set of still-feasible per-axis (input-requirement, output) micro-strategies
+    and shrinks them to a fixed point.
+    """
+
+    def __init__(self, optimizer):
+        self.opt = optimizer
+        self.mesh = optimizer.mesh
+        self.ndim = optimizer.mesh.ndim
+
+        # node -> list (indexed by mesh dim) of list[_Micro]
+        self.micros: dict = {}
+        # node -> list (indexed by mesh dim) of set[int] (feasible micro indices)
+        self.dom: dict = {}
+        # nodes whose domain has been narrowed below the initial full set
+        self.touched: set = set()
+        self._initial_strategy_count: dict = {}
+
+        self._build_micros()
+
+    # ---- construction ----
+
+    def _build_micros(self):
+        for node, op_strat in self.opt.strats.items():
+            if node.op == "output":
+                continue
+            strategies = op_strat.strategies
+            if not strategies:
+                continue
+            # Multi-output nodes (tuple output_specs, e.g. SDPA) are propagation
+            # barriers: there is no single output placement to project, so we
+            # neither narrow them nor propagate across them.  Their getitem
+            # users are single-output and handled normally.
+            if not isinstance(strategies[0].output_specs, DTensorSpec):
+                continue
+
+            args = self.opt._all_input_nodes(node)
+            n_args = len(args)
+            self._initial_strategy_count[node] = len(strategies)
+
+            per_axis_index: list = [dict() for _ in range(self.ndim)]
+            per_axis_micros: list = [[] for _ in range(self.ndim)]
+            for s in strategies:
+                out_pl = s.output_specs.placements
+                in_pls = []
+                for a in range(n_args):
+                    isp = s.input_specs[a] if a < len(s.input_specs) else None
+                    in_pls.append(
+                        isp.placements if isinstance(isp, DTensorSpec) else None
+                    )
+                for m in range(self.ndim):
+                    in_reqs = tuple(None if pl is None else pl[m] for pl in in_pls)
+                    micro = _Micro(in_reqs=in_reqs, out=out_pl[m])
+                    idx = per_axis_index[m]
+                    if micro not in idx:
+                        idx[micro] = len(per_axis_micros[m])
+                        per_axis_micros[m].append(micro)
+            self.micros[node] = per_axis_micros
+            self.dom[node] = [
+                set(range(len(per_axis_micros[m]))) for m in range(self.ndim)
+            ]
+
+    # ---- accessors ----
+
+    def _out_set(self, node, m) -> set:
+        micros = self.micros[node][m]
+        return {micros[i].out for i in self.dom[node][m]}
+
+    def _in_req_set(self, node, m, a) -> set:
+        micros = self.micros[node][m]
+        return {micros[i].in_reqs[a] for i in self.dom[node][m]}
+
+    def _consumer_edges(self, node):
+        """Yield (consumer, arg_index) for each tensor edge out of ``node``."""
+        for user in node.users:
+            if user not in self.dom:
+                continue
+            in_nodes = self.opt._all_input_nodes(user)
+            for a, src in enumerate(in_nodes):
+                if src is node:
+                    yield user, a
+
+    # ---- seeding ----
+
+    def seed(self, node, placements: tuple) -> bool:
+        node = self.opt._normalize_node(node)
+        if node not in self.dom:
+            logger.debug("seed: %s is not a single-output node, ignoring", node)
+            return False
+        changed = False
+        for m in range(self.ndim):
+            want = placements[m] if m < len(placements) else None
+            if want is None:
+                continue
+            micros = self.micros[node][m]
+            # Seeding is authoritative: recompute from the full strategy set so a
+            # user annotation overrides any earlier (lower-priority) propagation
+            # that may have narrowed this axis away from the annotated value.
+            keep = {i for i in range(len(micros)) if micros[i].out == want}
+            if not keep:
+                available = {micros[i].out for i in range(len(micros))}
+                raise ValueError(
+                    f"Annotation {placements} is not achievable for node "
+                    f"{node} on mesh dim {m}: this op only supports "
+                    f"{available} on that axis"
+                )
+            if keep != self.dom[node][m]:
+                self.dom[node][m] = keep
+                changed = True
+        if changed:
+            self.touched.add(node)
+        return changed
+
+    # ---- narrowing ----
+
+    def _narrow_from_producers(self, node) -> bool:
+        """Narrow ``node`` (as a consumer) toward reshard-free inputs."""
+        changed = False
+        args = self.opt._all_input_nodes(node)
+        for a, producer in enumerate(args):
+            if producer not in self.dom:
+                continue  # barrier or non-tensor producer
+            for m in range(self.ndim):
+                prod_outs = self._out_set(producer, m)
+                cur = self.dom[node][m]
+                micros = self.micros[node][m]
+                keep = {i for i in cur if micros[i].in_reqs[a] in prod_outs}
+                # Only tighten when a zero-reshard option exists; an empty keep
+                # means this edge is a genuine reshard boundary -> leave it to
+                # the ILP.
+                if keep and keep != cur:
+                    self.dom[node][m] = keep
+                    changed = True
+        return changed
+
+    def _narrow_from_consumer(self, node) -> bool:
+        """Narrow ``node`` (as a producer) toward what its single consumer wants.
+
+        Restricted to single-consumer producers: a multi-consumer value (e.g. a
+        residual stream) may legitimately be resharded for some consumers, so we
+        do not let one consumer dictate it.
+
+        Placeholders (parameters, buffers, graph inputs) are never narrowed this
+        way: their placement is the *stored* sharding, which legitimately differs
+        from the *compute* sharding the consumer needs by a reshard (e.g. an FSDP
+        all-gather on the data axis).  Inferring the storage sharding from the
+        consumer would wrongly pin, e.g., a weight to Replicate on the data axis
+        and defeat FSDP.  A placeholder's sharding comes only from its own
+        annotation; everything else about it is left to the ILP.
+        """
+        if node.op in ("placeholder", "get_attr"):
+            return False
+        edges = list(self._consumer_edges(node))
+        if len(edges) != 1:
+            return False
+        consumer, a = edges[0]
+        changed = False
+        for m in range(self.ndim):
+            cons_reqs = self._in_req_set(consumer, m, a)
+            cur = self.dom[node][m]
+            micros = self.micros[node][m]
+            keep = {i for i in cur if micros[i].out in cons_reqs}
+            if keep and keep != cur:
+                self.dom[node][m] = keep
+                changed = True
+        return changed
+
+    def _narrow_node(self, node) -> bool:
+        c1 = self._narrow_from_producers(node)
+        c2 = self._narrow_from_consumer(node)
+        changed = c1 or c2
+        if changed:
+            self.touched.add(node)
+        return changed
+
+    def propagate(self):
+        """Run the worklist narrowing to a fixed point."""
+        wl = deque(self.dom.keys())
+        inq = set(self.dom.keys())
+        steps = 0
+        while wl:
+            node = wl.popleft()
+            inq.discard(node)
+            steps += 1
+            if not self._narrow_node(node):
+                continue
+            # Re-enqueue neighbors whose domains may now narrow further.
+            neighbors = list(self.opt._all_input_nodes(node))
+            neighbors += [u for u in node.users]
+            for nb in neighbors:
+                if nb in self.dom and nb not in inq:
+                    wl.append(nb)
+                    inq.add(nb)
+        logger.debug("propagation fixpoint reached in %d worklist steps", steps)
+
+    # ---- results ----
+
+    def determined(self) -> dict:
+        """node -> list[(mesh_dim, placement)] for every determined axis of a
+        node that propagation actually touched."""
+        res = {}
+        for node in self.dom:
+            if node not in self.touched:
+                continue
+            axes = []
+            for m in range(self.ndim):
+                outs = self._out_set(node, m)
+                if len(outs) == 1:
+                    axes.append((m, next(iter(outs))))
+            if axes:
+                res[node] = axes
+        return res
+
+    def _feasible_strategy_count(self, node, determined_axes) -> int:
+        """How many of ``node``'s strategies satisfy all determined axes."""
+        strategies = self.opt.strats[node].strategies
+        count = 0
+        for s in strategies:
+            spec = s.output_specs
+            if not isinstance(spec, DTensorSpec):
+                count += 1
+                continue
+            if all(spec.placements[m] == p for m, p in determined_axes):
+                count += 1
+        return count
+
+    def run(self, annotations) -> dict:
+        """Seed ``annotations`` in priority order and propagate to a fixed point.
+
+        ``annotations`` is a list of ``(node, ShardingAnnotation)``.  Returns the
+        ``determined()`` mapping.
+        """
+        by_priority: dict = defaultdict(list)
+        for node, ann in annotations:
+            by_priority[ann.priority].append((node, ann))
+        for priority in sorted(by_priority):
+            for node, ann in by_priority[priority]:
+                self.seed(node, ann.placements)
+            self.propagate()
+        return self.determined()
+
+    def _paired_boundary_nodes(self) -> set:
+        """Backward nodes tied to a forward node by a forward/backward
+        consistency constraint: parameter gradients, input gradients, and output
+        tangents.  These must be left to the pairing (which mirrors the forward
+        decision onto them); constraining them independently can contradict it.
+        """
+        from torch._functorch._aot_autograd.fx_utils import (
+            get_param_and_grad_nodes,
+            get_plain_input_and_grad_nodes,
+            get_plain_output_and_tangent_nodes,
+        )
+
+        graph = self.opt.graph
+        nodes = set()
+        for _p, grad in get_param_and_grad_nodes(graph).values():
+            if grad is not None:
+                nodes.add(grad)
+        for _i, grad in get_plain_input_and_grad_nodes(graph).values():
+            if grad is not None:
+                nodes.add(grad)
+        for _o, tangent in get_plain_output_and_tangent_nodes(graph).values():
+            if tangent is not None:
+                nodes.add(tangent)
+        return nodes
+
+    def _backward_node_set(self) -> set:
+        """Nodes belonging to the backward pass: everything reachable from a
+        tangent (incoming-gradient) placeholder.
+
+        Propagation does not constrain these.  Their sharding is tied to the
+        forward pass by the optimizer's forward/backward consistency constraints
+        (param<->grad, input<->grad, output<->tangent), so constraining them
+        independently risks contradicting that pairing (e.g. forcing a weight's
+        gradient to a placement its parameter cannot take).  Leaving them to the
+        ILP keeps the problem feasible while the forward constraints already
+        collapse most of the backward search space through the pairing.
+        """
+        seeds = [
+            n
+            for n in self.opt.graph.nodes
+            if n.op == "placeholder" and n.name.startswith("tangents")
+        ]
+        backward = set()
+        stack = list(seeds)
+        while stack:
+            n = stack.pop()
+            for u in n.users:
+                if u not in backward:
+                    backward.add(u)
+                    stack.append(u)
+        return backward
+
+    def _total_strategy_count(self) -> int:
+        total = 0
+        for node, op_strat in self.opt.strats.items():
+            if node.op == "output":
+                continue
+            total += len(op_strat.strategies)
+        return total
+
+    def apply_to_optimizer(
+        self, forward_only=False, aggressive=False, method="fix"
+    ) -> PropagationResult:
+        """Emit per-axis constraints for every determined axis of every touched
+        node and return a summary of the search-space reduction.
+
+        Nodes the user already constrained explicitly are skipped, as are the
+        forward/backward *paired boundary* nodes (parameter/input gradients and
+        output tangents), whose sharding is decided by the pairing rather than
+        propagation.  When ``forward_only`` is set, all backward-pass nodes are
+        skipped (more conservative; only the forward graph is constrained).  A
+        node is also skipped if its determined axes do not co-occur in any single
+        strategy (a safety net, not expected in practice).
+
+        By default (``aggressive=False``) an axis is only pinned when it is a
+        genuine ``Shard``.  A Shard encodes the tensor-parallel structure the
+        annotations describe and is invariant in the optimum.  ``Replicate`` and
+        ``Partial`` are deliberately *not* pinned:
+
+        * Pinning ``Replicate`` would forbid the ILP from instead sharding that
+          axis (e.g. choosing sequence parallelism on the residual stream).
+        * ``Partial`` is a pending reduction whose collective (all-reduce /
+          reduce-scatter) the ILP places; pinning it fixes where the reduction
+          happens and can even be infeasible (a Partial value cannot be added to
+          a Replicate residual without first reducing it).
+
+        Both are genuine cost tradeoffs, so leaving them open keeps the optimum
+        reachable while costing little search-space reduction.
+
+        ``method`` is forwarded to :meth:`ShardingOptimizer.add_node_axis_constraint`:
+        ``"fix"`` (default) removes the ruled-out decision variables so the
+        problem actually shrinks, ``"constraint"`` adds equality rows instead.
+        """
+        determined = self.determined()
+        already = set(self.opt._node_constraint_names.values())
+        excluded = self._paired_boundary_nodes()
+        if forward_only:
+            excluded |= self._backward_node_set()
+
+        result = PropagationResult(determined=determined)
+        result.strategies_before = self._total_strategy_count()
+        result.nodes_touched = len(self.touched)
+
+        strategies_saved = 0
+        for node, axes in determined.items():
+            if node.name in already or node in excluded:
+                continue
+            pin_axes = [(m, p) for m, p in axes if aggressive or p.is_shard()]
+            if not pin_axes:
+                continue
+            full = len(self.opt.strats[node].strategies)
+            feasible = self._feasible_strategy_count(node, pin_axes)
+            if feasible == 0 or feasible == full:
+                continue
+            for m, p in pin_axes:
+                self.opt.add_node_axis_constraint(
+                    node, m, p, constraint_name="propagated", method=method
+                )
+                result.axis_constraints += 1
+            result.nodes_determined += 1
+            strategies_saved += full - feasible
+
+        result.strategies_after = result.strategies_before - strategies_saved
+        logger.info(
+            "propagation: touched %d nodes, constrained %d nodes with %d "
+            "per-axis constraints; output-strategy choices %d -> %d (%.1f%% "
+            "reduction)",
+            result.nodes_touched,
+            result.nodes_determined,
+            result.axis_constraints,
+            result.strategies_before,
+            result.strategies_after,
+            100.0 * result.reduction,
+        )
+        return result
diff --git a/docs/README.md b/docs/README.md
index 9299286f..4aa2dc2d 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -23,5 +23,6 @@ If you're new to the project, use the reading order below.
 
 ## Advanced usage
 
+- [Sharding Annotations and Shardy-like Propagation](sharding_annotations.md)
 - [Using `local_map` for MoE and Custom Communication Patterns](local_map_and_moe.md)
 - [Saving and Loading Optimizer State](save_load.md)
diff --git a/docs/sharding_annotations.md b/docs/sharding_annotations.md
new file mode 100644
index 00000000..b9248cb9
--- /dev/null
+++ b/docs/sharding_annotations.md
@@ -0,0 +1,183 @@
+# Sharding Annotations and Shardy-like Propagation
+
+By default AutoParallel hands the entire sharding decision to the ILP: every
+node enumerates every valid placement and the solver picks the global optimum.
+That is the right default for a fresh model, but at scale the search space is
+large even though the user often already knows the high-level plan — "the
+attention and MLP projections are tensor-parallel; the batch is data-parallel".
+
+This page describes how to express that plan as a few **sharding annotations**
+and have AutoParallel **propagate** them through the graph the way
+[Shardy](https://github.com/openxla/shardy) does, turning the unambiguous part
+of the graph into ILP constraints. This shrinks the search space and the solve
+time while leaving the genuine cost tradeoffs to the solver. With a typical
+tensor-parallel annotation on LLaMA-3 it reaches the *same* objective as the
+full ILP on a noticeably smaller problem.
+
+If you are new to the project, start with
+[Getting Started](getting_started.md) and
+[How AutoParallel Chooses a Strategy](how_autoparallel_chooses_a_strategy.md).
+
+## The annotation API
+
+Annotations are added on the `AutoParallel` context manager, after the input /
+output constraints and before `optimize_placement`:
+
+```python
+with AutoParallel(model, input_fn, mesh) as autop:
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([(Shard(0), Replicate())])
+    autop.add_output_constraints([(Shard(0), Shard(2))])
+
+    # Annotate the tensor-parallel plan. A glob matches the weight in every
+    # layer at once. Only the tp axis is pinned; the data axis is left open.
+    column_parallel = (None, Shard(0))   # shard the output dim
+    row_parallel = (None, Shard(1))      # shard the input dim
+    for proj in ["wq", "wk", "wv"]:
+        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", column_parallel)
+    autop.annotate_parameter("layers.*.attention.wo.weight", row_parallel)
+    for proj in ["w1", "w3"]:
+        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", column_parallel)
+    autop.annotate_parameter("layers.*.feed_forward.w2.weight", row_parallel)
+
+    autop.propagate_annotations()        # propagate + constrain
+    sharding = autop.optimize_placement()
+```
+
+A placement is a tuple with one entry per mesh dimension. Each entry is a
+`Placement` (`Shard(d)`, `Replicate()`, ...) or **`None`** to leave that mesh
+axis *open* for propagation / the ILP to decide. Leaving the data axis open is
+the common case for weights: you pin the tensor-parallel axis and let the
+optimizer choose FSDP vs DDP on the data axis.
+
+The available annotation methods are:
+
+- `annotate_parameter(fqn, placements, priority=1)` — `fqn` is a parameter
+  fully-qualified name or a glob pattern (e.g. `"layers.*.attention.wq.weight"`).
+- `annotate_input(idx, placements, priority=0)` /
+  `annotate_output(idx, placements, priority=0)` — graph input/output by index.
+- `annotate_node(node, placements, priority=0)` — an arbitrary FX node.
+
+`priority` controls the order annotations propagate (lower first, matching
+Shardy). Activations/IO default to a higher priority than weights so that where
+they compete for the same mesh axis (the data axis of a matmul) the
+data-parallel sharding wins and the weight is all-gathered, rather than the
+activation being resharded.
+
+`propagate_annotations()` returns a `PropagationResult` summarizing the
+reduction (`nodes_determined`, `axis_constraints`, `reduction`).
+
+## How propagation works
+
+Propagation mirrors the structure of Shardy's propagation, expressed over
+AutoParallel's existing per-node strategy lists (which already encode each op's
+sharding rule):
+
+- **Per-mesh-axis.** A placement is propagated one mesh axis at a time. This is
+  what lets a weight's tensor-parallel sharding flow through a matmul on the
+  `tp` axis while the `dp` axis is resolved independently (data-parallel batch,
+  with FSDP all-gathers left to the ILP). It is the analogue of Shardy
+  projecting tensor shardings onto per-factor axes.
+
+- **Reshard-free.** Along an edge a consumer is only narrowed to the placements
+  it can take *without* a reshard from the producer (zero redistribution cost).
+  At a genuine reshard boundary — a necessary collective such as an all-reduce
+  or all-gather — no zero-cost option exists, so propagation stops there and the
+  ILP decides the collective.
+
+- **To a fixed point.** A worklist re-examines a node's neighbors whenever its
+  set of candidate shardings shrinks, until nothing changes.
+
+- **Priority rounds.** Annotations propagate in priority order; later rounds
+  cannot override what an earlier round determined.
+
+Once propagation reaches a fixed point, every mesh axis of a node whose sharding
+became unambiguous is turned into a per-axis ILP constraint
+(`add_node_axis_constraint`), which constrains that one axis and leaves the rest
+of the node free.
+
+### What is and isn't pinned
+
+Propagation deliberately only pins genuine **`Shard`** placements — the
+tensor-parallel structure the annotations describe, which is invariant in the
+optimum. It does *not* pin:
+
+- **`Replicate`** — pinning it would forbid the ILP from instead sharding that
+  axis (for example choosing sequence parallelism on the residual stream).
+- **`Partial`** — a pending reduction whose collective the ILP places; pinning
+  it fixes where the reduction happens and can even be infeasible (a `Partial`
+  value cannot be added to a `Replicate` residual without first reducing it).
+
+Both are genuine cost tradeoffs, so leaving them open keeps the optimum
+reachable at little cost to the reduction.
+
+Two more correctness rules keep the constraint set feasible and faithful:
+
+- **Parameters are sources only.** A parameter's placement is its *stored*
+  sharding, which legitimately differs from the *compute* sharding a consumer
+  needs by a reshard (an FSDP all-gather). Propagation never infers a
+  parameter's sharding from its consumers, so an open data axis stays free for
+  FSDP, and a per-axis parameter constraint still counts toward the memory
+  budget on its free axes.
+- **Backward pass via the pairing.** The forward/backward consistency
+  constraints already tie each gradient to its forward tensor, so the
+  parameter/input gradients and output tangents are left for the pairing to
+  decide; the rest of the backward graph is constrained normally (and the
+  forward annotations are mirrored onto the gradients to drive that).
+
+## How a pin is applied: variable fixing vs constraints
+
+`propagate_annotations(method=...)` (forwarded to
+`ShardingOptimizer.add_node_axis_constraint`) controls how each determined axis
+is committed to the ILP:
+
+- **`"fix"` (default)** sets the upper bound of the ruled-out decision variables
+  to 0, so the solver's presolve drops those columns and the problem actually
+  shrinks.
+- **`"constraint"`** adds an `== 1` equality row over the matching variables.
+  It is removable by name, but on a large mesh adding thousands of rows without
+  removing any columns can *slow* the solve.
+
+Variable fixing is strictly better for solve time (and never worse for the
+objective), which is why it is the default.
+
+## Solver performance and the LP relaxation
+
+`ShardingOptimizer.solve_lp_relaxation()` solves the continuous relaxation
+(binaries relaxed to `[0, 1]`) and reports the objective, solve time, and how
+many variables came out fractional. It exposes two facts that matter for
+performance:
+
+1. **The relaxation is integral.** On LLaMA-3 (2D and 3D meshes), with and
+   without annotations, the LP relaxation comes out with *zero* fractional
+   variables and an integrality gap of 0% — its optimum already *is* the integer
+   optimum. So `solve_lp_relaxation(extract=True)` returns a valid optimal
+   per-node strategy dict (same form as `get_solution`) while skipping
+   branch-and-bound, which is several times faster than the MILP solve (e.g. on
+   the 16-layer 2D model, ~10s vs ~50s; on a 2M-variable 3D problem, ~45s vs
+   ~160s). This is the single biggest available speedup and is exact whenever
+   the relaxation is integral (it falls back to `None` when it is not).
+
+2. **Where annotations help the MILP.** Because the relaxation is integral,
+   there is little branch-and-bound to cut, so the annotation speedup is
+   scale-dependent: on a ~400k-variable problem the MILP overhead is a large
+   fraction and pinning the TP structure gives ~1.7–1.8×; on a ~2M-variable
+   problem the solve is dominated by the relaxation/model size itself, so the
+   speedup shrinks toward ~1× even though the *search space* shrinks more (the
+   extra mesh axis gives more axes to pin — e.g. −59% strategy choices on 3D vs
+   −36% on 2D). The annotation speedup on the *LP* solve is correspondingly
+   modest (~1.1–1.4×). The takeaway: annotations reduce the search space and
+   keep the optimum exact, but for raw solve time on this (integral) problem the
+   larger lever is solving the relaxation directly.
+
+A separate, orthogonal cost is that building the ILP for a 3-axis mesh is slow:
+per-node strategy enumeration grows with the number of mesh axes (it is cubic
+for a 3-axis mesh, dominated by the 4D attention tensors), which is independent
+of the solve and of annotations.
+
+## Example
+
+`examples/example_llama3_annotated.py` runs the full ILP and the
+annotated+propagation path on a LLaMA-3-1B model on a 2D mesh and prints the
+comparison: the annotated path reaches the same objective on a search space
+reduced by roughly a third, with a correspondingly faster solve.
diff --git a/examples/example_llama3_annotated.py b/examples/example_llama3_annotated.py
new file mode 100644
index 00000000..7e1f1ecb
--- /dev/null
+++ b/examples/example_llama3_annotated.py
@@ -0,0 +1,145 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Sharding annotations + Shardy-like propagation on LLaMA3-1B (2D mesh).
+
+By default AutoParallel hands the whole sharding decision to the ILP.  At scale
+a user usually already knows the tensor-parallel plan ("these projections are
+column-parallel, those are row-parallel").  This example shows how to express
+that plan as a few *annotations*, propagate it through the graph the way Shardy
+does, and turn the unambiguous part of the graph into ILP constraints.
+
+The annotations pin only the **tensor-parallel (tp) axis** of the transformer
+body weights.  Everything else -- the data/FSDP axis, the residual stream
+(replicate vs sequence-parallel), the vocab/embedding sharding, and where the
+collectives go -- is left to the ILP.  Propagation then determines the sharding
+of the activations that *follow* from the plan with no resharding and constrains
+them, which shrinks the search space and the solve time while leaving the
+genuine cost tradeoffs to the solver.
+
+Run it (no GPUs needed -- uses a fake process group):
+
+    python examples/example_llama3_annotated.py
+"""
+
+import logging
+import time
+
+import pulp
+import torch
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import (
+    Transformer,
+    TransformerModelArgs,
+    apply_ac,
+)
+from autoparallel.api import AutoParallel
+
+logging.basicConfig(level=logging.WARNING)
+
+world_size = 64
+fake_store = FakeStore()
+torch.distributed.init_process_group(
+    "fake", store=fake_store, rank=0, world_size=world_size
+)
+
+# 2D mesh: data/FSDP on dp, tensor-parallel on tp.
+dp, tp = world_size // 8, 8
+mesh = torch.distributed.device_mesh.init_device_mesh(
+    "cuda", (dp, tp), mesh_dim_names=("dp", "tp")
+)
+
+# Small-batch / long-sequence regime, where tensor parallelism is worthwhile.
+vocab_size = 128256
+seqlen = 2048
+batch_size = 2 * dp
+
+
+def model_fn():
+    # LLaMA-3.2-1B-ish config.
+    return Transformer(
+        TransformerModelArgs(
+            dim=2048,
+            n_layers=16,
+            n_heads=32,
+            n_kv_heads=8,
+            ffn_dim_multiplier=1.5,
+            multiple_of=256,
+            rope_theta=500000,
+            vocab_size=vocab_size,
+            max_seq_len=seqlen,
+        )
+    )
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
+
+
+def annotate_tp_plan(autop):
+    """The 'conscious' tensor-parallel plan, as a handful of annotations.
+
+    Only the tp axis is pinned (the data axis is left ``None`` = open).  A glob
+    pattern annotates the matching weight in every layer at once.
+    """
+    column_parallel = (None, Shard(0))  # shard the output dim (dim 0 of [out, in])
+    row_parallel = (None, Shard(1))  # shard the input dim (dim 1 of [out, in])
+    for proj in ["wq", "wk", "wv"]:
+        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", column_parallel)
+    autop.annotate_parameter("layers.*.attention.wo.weight", row_parallel)
+    for proj in ["w1", "w3"]:
+        autop.annotate_parameter(
+            f"layers.*.feed_forward.{proj}.weight", column_parallel
+        )
+    autop.annotate_parameter("layers.*.feed_forward.w2.weight", row_parallel)
+
+
+with torch.device("meta"):
+    model = model_fn()
+apply_ac(model, mode="full")
+
+with AutoParallel(model, input_fn, mesh, repeated_subgraphs=True) as autop:
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([(Shard(0), Replicate())])
+    autop.add_output_constraints([(Shard(0), Shard(2))])  # vocab-parallel logits
+    opt = autop.sharding_optimizer
+    print(
+        f"ILP: {len(opt.strats)} nodes, {len(opt.decision_vars)} decision variables "
+        f"on a ({dp}, {tp}) mesh"
+    )
+
+    # --- Baseline: full ILP, no annotations ---
+    t = time.perf_counter()
+    autop.optimize_placement(verbose=False)
+    t_baseline = time.perf_counter() - t
+    obj_baseline = pulp.value(opt.prob.objective)
+    print(
+        f"baseline full ILP : objective {obj_baseline:11.1f}   solve {t_baseline:6.1f}s"
+    )
+
+    # --- Annotated: propagate the TP plan, then solve the reduced problem ---
+    annotate_tp_plan(autop)
+    result = autop.propagate_annotations(verbose=False)
+    t = time.perf_counter()
+    opt.resolve(verbose=False)
+    t_annotated = time.perf_counter() - t
+    obj_annotated = pulp.value(opt.prob.objective)
+    print(
+        f"annotated + propag: objective {obj_annotated:11.1f}   solve {t_annotated:6.1f}s"
+    )
+
+    gap = 100 * (obj_annotated - obj_baseline) / obj_baseline
+    print(
+        f"\npropagation pinned {result.nodes_determined} nodes "
+        f"({result.axis_constraints} per-axis constraints), shrinking the "
+        f"output-strategy search space by {100 * result.reduction:.1f}% "
+        f"({result.strategies_before} -> {result.strategies_after})"
+    )
+    print(
+        f"objective gap vs full ILP: {gap:+.2f}%   "
+        f"solve speedup: {t_baseline / max(t_annotated, 1e-9):.1f}x"
+    )
diff --git a/tests/test_propagation.py b/tests/test_propagation.py
new file mode 100644
index 00000000..34bb7af2
--- /dev/null
+++ b/tests/test_propagation.py
@@ -0,0 +1,222 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pulp
+import pytest
+import torch
+import torch.nn.functional as F
+from conftest import apply_cuda_patches
+from torch import nn
+from torch._functorch._aot_autograd.fx_utils import get_param_and_grad_nodes
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel.api import AutoParallel
+from autoparallel.propagation import ShardingAnnotation, ShardingPropagator
+
+
+class TPBlock(nn.Module):
+    """A minimal transformer block: attention + SwiGLU FFN, the structure a
+    column/row-parallel tensor-parallel plan applies to."""
+
+    def __init__(self, dim=512, hidden=1024, nheads=8):
+        super().__init__()
+        self.nheads = nheads
+        self.wq = nn.Linear(dim, dim, bias=False)
+        self.wk = nn.Linear(dim, dim, bias=False)
+        self.wv = nn.Linear(dim, dim, bias=False)
+        self.wo = nn.Linear(dim, dim, bias=False)
+        self.w1 = nn.Linear(dim, hidden, bias=False)
+        self.w2 = nn.Linear(hidden, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden, bias=False)
+
+    def forward(self, x):
+        q, k, v = self.wq(x), self.wk(x), self.wv(x)
+        q = q.unflatten(-1, (self.nheads, -1)).permute(0, 2, 1, 3)
+        k = k.unflatten(-1, (self.nheads, -1)).permute(0, 2, 1, 3)
+        v = v.unflatten(-1, (self.nheads, -1)).permute(0, 2, 1, 3)
+        o = F.scaled_dot_product_attention(q, k, v)
+        o = o.permute(0, 2, 1, 3).flatten(-2)
+        h = self.wo(o) + x
+        return h + self.w2(F.silu(self.w1(h)) * self.w3(h))
+
+
+def _input_fn():
+    bs = 32
+    return torch.randn(bs, 128, 512, device="cuda", requires_grad=True)
+
+
+def _enter_autop(mesh):
+    with torch.device("meta"):
+        model = TPBlock()
+    autop = AutoParallel(model, _input_fn, mesh)
+    autop.__enter__()
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
+    autop.add_input_constraints([x_sharding])
+    autop.add_output_constraints([x_sharding])
+    return autop
+
+
+def _annotate_tp(autop):
+    col, row = (None, Shard(0)), (None, Shard(1))
+    for proj in ["wq", "wk", "wv", "w1", "w3"]:
+        autop.annotate_parameter(f"{proj}.weight", col)
+    for proj in ["wo", "w2"]:
+        autop.annotate_parameter(f"{proj}.weight", row)
+
+
+@apply_cuda_patches
+def test_propagation_matches_full_ilp(device_mesh_2d):
+    """Annotating the TP plan and propagating shrinks the search space while the
+    reduced ILP reaches the same optimum as the full ILP."""
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        autop.optimize_placement(verbose=False)
+        obj_full = pulp.value(opt.prob.objective)
+
+        _annotate_tp(autop)
+        result = autop.propagate_annotations(verbose=False)
+        opt.resolve(verbose=False)
+        obj_annotated = pulp.value(opt.prob.objective)
+
+        assert opt.prob.status == 1  # Optimal
+        # Same optimum (propagation only pins reshard-free, unambiguous sharding).
+        assert obj_annotated == pytest.approx(obj_full, rel=1e-6)
+        # And it actually pruned a meaningful chunk of the search space.
+        assert result.reduction > 0.1
+        assert result.nodes_determined > 0
+    finally:
+        autop.__exit__(None, None, None)
+
+
+@apply_cuda_patches
+def test_lp_relaxation_is_integral_and_exact(device_mesh_2d):
+    """The LP relaxation of the sharding ILP is integral here, so solving it is a
+    cheaper exact solve: same objective as the ILP, with an extractable solution."""
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        autop.optimize_placement(verbose=False)
+        obj_ilp = pulp.value(opt.prob.objective)
+
+        lp = opt.solve_lp_relaxation(extract=True)
+        assert lp["n_fractional"] == 0  # relaxation is integral
+        assert lp["objective"] == pytest.approx(obj_ilp, rel=1e-6)
+        assert lp["solution"] is not None
+        # one strategy per (single-output) decision node
+        assert len(lp["solution"]) > 0
+    finally:
+        autop.__exit__(None, None, None)
+
+
+@apply_cuda_patches
+def test_axis_constraint_fix_method_matches_constraint(device_mesh_2d):
+    """Pinning an axis by fixing variables gives the same result as the equality
+    constraint, and is exact."""
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()}
+        wq = fqn["wq.weight"]
+        opt.add_node_axis_constraint(wq, mesh_dim=1, placement=Shard(0), method="fix")
+        solution = autop.optimize_placement(verbose=False)
+        assert opt.prob.status == 1
+        placements = solution[opt._concrete_to_orig.get(wq, wq)].output_specs.placements
+        assert placements[1] == Shard(0)
+    finally:
+        autop.__exit__(None, None, None)
+
+
+@apply_cuda_patches
+def test_add_node_axis_constraint_pins_one_axis(device_mesh_2d):
+    """A per-axis constraint pins the chosen mesh axis and leaves the other free."""
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()}
+        wq = fqn["wq.weight"]
+        opt.add_node_axis_constraint(wq, mesh_dim=1, placement=Shard(0))
+        solution = autop.optimize_placement(verbose=False)
+        placements = solution[opt._concrete_to_orig.get(wq, wq)].output_specs.placements
+        # tp axis pinned to Shard(0); dp axis decided by the ILP.
+        assert placements[1] == Shard(0)
+    finally:
+        autop.__exit__(None, None, None)
+
+
+@apply_cuda_patches
+def test_axis_constraint_keeps_param_shardable_for_fsdp(device_mesh_2d):
+    """A per-axis tp constraint must not exclude a parameter from the memory
+    budget: it should still be shardable on the (free) data axis for FSDP."""
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()}
+        wq = fqn["wq.weight"]
+        # Column-parallel on tp; data axis left open.
+        opt.add_node_axis_constraint(wq, mesh_dim=1, placement=Shard(0))
+        solution = autop.optimize_placement(verbose=False)
+        assert opt.prob.status == 1  # feasible despite the tight memory budget
+        placements = solution[opt._concrete_to_orig.get(wq, wq)].output_specs.placements
+        # FSDP shards the data axis too (tight 1/world_size budget).
+        assert placements[0] == Shard(0)
+        assert placements[1] == Shard(0)
+    finally:
+        autop.__exit__(None, None, None)
+
+
+@apply_cuda_patches
+def test_seed_unachievable_raises(device_mesh_2d):
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        prop = ShardingPropagator(opt)
+        fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()}
+        wq = fqn["wq.weight"]
+        # wq.weight is 2D; sharding a non-existent tensor dim 5 is impossible.
+        with pytest.raises(ValueError):
+            prop.seed(wq, (None, Shard(5)))
+    finally:
+        autop.__exit__(None, None, None)
+
+
+@apply_cuda_patches
+def test_propagation_determines_matmul_outputs(device_mesh_2d):
+    """Seeding the column-parallel weights determines the tp axis of the matmul
+    outputs (sharded on the output feature) with no resharding."""
+    autop = _enter_autop(device_mesh_2d)
+    try:
+        opt = autop.sharding_optimizer
+        prop = ShardingPropagator(opt)
+        annotations = []
+        fqn = {d.target: n for d, (n, _) in get_param_and_grad_nodes(opt.graph).items()}
+        for proj in ["wq", "wk", "wv", "w1", "w3"]:
+            annotations.append(
+                (fqn[f"{proj}.weight"], ShardingAnnotation((None, Shard(0)), 1))
+            )
+        for proj in ["wo", "w2"]:
+            annotations.append(
+                (fqn[f"{proj}.weight"], ShardingAnnotation((None, Shard(1)), 1))
+            )
+        determined = prop.run(annotations)
+
+        # Every column-parallel matmul output should be tp-sharded (not replicated).
+        einsum_nodes = opt.graph.find_nodes(
+            op="call_function", target=torch.ops.aten.einsum.default
+        )
+        if not einsum_nodes:
+            einsum_nodes = opt.graph.find_nodes(
+                op="call_function", target=torch.ops.aten.mm.default
+            )
+        n_tp_pinned = 0
+        for n in einsum_nodes:
+            if n in determined:
+                tp = dict(determined[n]).get(1)
+                if isinstance(tp, Shard):
+                    n_tp_pinned += 1
+        assert n_tp_pinned > 0
+    finally:
+        autop.__exit__(None, None, None)

From c33c0ef2b87400882fa0f51e322cdaa87ac29e17 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 18:55:04 -0700
Subject: [PATCH 09/27] Integrate prune + dp_solver + annotated into a joint
 optimization

Make the approximate (dp) solver work with the pruned search space and the
propagated per-axis annotations, the two pieces neither branch had on its own:

- Pruning removes infinite-cost edges from decision_vars entirely, so the
  approx solver must treat a key absent from decision_vars as forbidden
  (_is_forbidden) and read per-strategy costs from any surviving inp_idx
  (_surviving_dv). Applied across the forbidden checks and decision_var reads.
- Replay add_node_axis_constraint from _constraint_log in both the PuLP and
  the lite (no-PuLP) topology paths so propagated Shard pins restrict the
  approx solver's per-node out_idx domain (method="fix" leaves no PuLP row).
- Port the forward param-dtype constraint (current main) into _topology_direct
  so the lite build matches the full build exactly under mixed precision.
- Guard _fix_node_output_indices / add_node_axis_constraint against pruned
  (None) variables and the lite build.

Authored with Claude.
---
 autoparallel/approximate_sharding.py | 132 ++++++++++++++++++++++++---
 autoparallel/optimize_sharding.py    |   2 +
 2 files changed, 123 insertions(+), 11 deletions(-)

diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
index 7e1a945a..146effcf 100644
--- a/autoparallel/approximate_sharding.py
+++ b/autoparallel/approximate_sharding.py
@@ -427,6 +427,11 @@ def _parse_constraints(self):
                         (next(iter(na)), next(iter(nb)),
                          frozenset({(next(iter(oa)), next(iter(ob)))}))
                     )
+        # method="fix" axis pins leave no PuLP row to parse above, so replay the
+        # log to recover them (constraint-method pins are also picked up here,
+        # idempotently with their == 1 rows).
+        for n, out_set in self._axis_restrict_from_log().items():
+            restrict[n] = restrict.get(n, out_set) & out_set
         for n, out_set in restrict.items():
             if n in self.allowed_out:
                 self.allowed_out[n] = [o for o in self.allowed_out[n] if o in out_set]
@@ -469,7 +474,39 @@ def nroot(idx):
             if not math.isfinite(dv.cost) or dv.cost == 10000.0:
                 self.forbidden.add(key)
 
-        # 2. grad-reduce-dtype forbidden (== add_grad_reduce_dtype_constraints).
+        # 2a. forward param-dtype forbidden (== add_grad_reduce_dtype_constraints
+        #     forward part, unconditional). Force the FSDP allgather to run after
+        #     a downcasting param dtype_cast (in the smaller param_dtype) by
+        #     forbidding any pre-cast redistribution.
+        cast_op = torch.ops.autoparallel.dtype_cast.default
+        fwd_pre_cast: set[int] = set()
+        for param, _grad in get_param_and_grad_nodes(opt.graph).values():
+            n = param
+            while True:
+                if n.target == cast_op:
+                    break
+                users = list(n.users.keys())
+                if len(users) != 1:
+                    break
+                child = users[0]
+                if len(child.all_input_nodes) != 1:
+                    break
+                n = child
+            if n.target != cast_op:
+                continue
+            if n.meta["val"].dtype.itemsize >= param.meta["val"].dtype.itemsize:
+                continue  # only constrain downcasts
+            node = n
+            while node != param:
+                if node in opt.node_map:
+                    fwd_pre_cast.add(opt.node_map[node])
+                node = node.all_input_nodes[0]
+        for key, dv in opt.decision_vars.items():
+            if key[0] in fwd_pre_cast and dv.comm_cost > 0:
+                self.forbidden.add(key)
+
+        # 2. grad-reduce-dtype (backward) forbidden
+        #    (== add_grad_reduce_dtype_constraints backward part).
         if getattr(opt, "force_grad_reduce_in_higher_precision", False):
             cast_op = torch.ops.autoparallel.dtype_cast.default
             pre_cast: set[int] = set()
@@ -553,6 +590,12 @@ def add_paired(node_a, node_b):
                             break
             r = nroot(opt.node_map[node])
             restrict[r] = restrict.get(r, out_set) & out_set
+        # 4b. per-axis placement restrictions (== add_node_axis_constraint), what
+        #     sharding propagation emits. With method="fix" these leave no PuLP
+        #     row to parse, so replaying the log is the only way the approx solver
+        #     sees the pin.
+        for r, out_set in self._axis_restrict_from_log().items():
+            restrict[r] = restrict.get(r, out_set) & out_set
         for n_idx, out_set in restrict.items():
             if n_idx in self.allowed_out:
                 self.allowed_out[n_idx] = [
@@ -585,10 +628,69 @@ def add_paired(node_a, node_b):
 
         return paired_edges, authoritative
 
+    def _axis_restrict_from_log(self):
+        """out_idx restrictions implied by add_node_axis_constraint calls,
+        replayed from _constraint_log → {root_node_idx: set(out_idx)}.
+
+        This is how the approximate solver honors propagated per-axis pins: keep
+        only the strategies whose output placement matches the pinned axis,
+        exactly like ShardingOptimizer.add_node_axis_constraint. It works whether
+        the pin was applied as a PuLP row ("constraint") or as variable bounds
+        ("fix", which leaves no row to parse) and in the lite (no-PuLP) build."""
+        opt = self.opt
+        node_root = {lk[0]: rk[0] for lk, rk in opt.cluster_links.items()}
+        restrict: dict[int, set] = {}
+        for fname, kwargs in getattr(opt, "_constraint_log", []):
+            if fname != "add_node_axis_constraint":
+                continue
+            node = next(
+                (nd for nd in opt.nodes if nd.name == kwargs["node_name"]), None
+            )
+            if node is None or node not in opt.strats:
+                continue
+            mesh_dim, placement = kwargs["mesh_dim"], kwargs["placement"]
+            out_set = set()
+            for i, s in enumerate(opt.strats[node].strategies):
+                specs = s.output_specs
+                if isinstance(specs, DTensorSpec):
+                    spec = specs
+                elif isinstance(specs, (list, tuple)):
+                    spec = next((x for x in specs if isinstance(x, DTensorSpec)), None)
+                else:
+                    spec = None
+                if spec is not None and spec.placements[mesh_dim] == placement:
+                    out_set.add(i)
+            r = node_root.get(opt.node_map[node], opt.node_map[node])
+            restrict[r] = restrict.get(r, out_set) & out_set
+        return restrict
+
+    def _is_forbidden(self, key) -> bool:
+        """A strategy edge is forbidden if a constraint ruled it out OR it was
+        pruned for infinite cost. Pruning removes such keys from decision_vars
+        entirely (see ShardingOptimizer._build_decision_vars), so a key missing
+        from decision_vars is just as forbidden as one in ``self.forbidden``."""
+        return key in self.forbidden or key not in self.opt.decision_vars
+
+    def _surviving_dv(self, v, argi, o):
+        """A DecisionVar for (v, argi, o, *) using any inp_idx that survived
+        pruning, or None if every edge for that (arg, out) was pruned.
+        compute_cost / input_spec are identical across inp_idx for a fixed out."""
+        strat = self.opt.strats[self.opt.nodes[v]].strategies[o]
+        n_inp = (
+            len(strat.redistribute_cost[argi])
+            if argi < len(strat.redistribute_cost)
+            else 1
+        )
+        for inp in range(n_inp):
+            dv = self.opt.decision_vars.get((v, argi, o, inp))
+            if dv is not None:
+                return dv
+        return None
+
     def _out_fully_forbidden(self, v, node, o):
         strat = self.opt.strats[node].strategies[o]
         for argi, costs in enumerate(strat.redistribute_cost):
-            if all((v, argi, o, inp) in self.forbidden for inp in range(len(costs))):
+            if all(self._is_forbidden((v, argi, o, inp)) for inp in range(len(costs))):
                 return True
         return False
 
@@ -736,13 +838,16 @@ def _choice_lower_bound(self, v, node, o):
         opt = self.opt
         strat = opt.strats[node].strategies[o]
         mult = self.node_mult[v]
-        lb = opt.decision_vars[(v, 0, o, 0)].compute_cost * len(strat.redistribute_cost)
+        dv0 = self._surviving_dv(v, 0, o)
+        if dv0 is None:
+            return INF  # every edge for this output strategy was pruned
+        lb = dv0.compute_cost * len(strat.redistribute_cost)
         lb *= mult
         for argi, _p in self.input_edges.get(v, []):
             best = INF
             for inp in range(len(strat.redistribute_cost[argi])):
                 key = (v, argi, o, inp)
-                if key in self.forbidden:
+                if self._is_forbidden(key):
                     continue
                 dv = opt.decision_vars[key]
                 best = min(best, dv.comm_cost + dv.sharding_transition_cost)
@@ -797,7 +902,7 @@ def _build_memory_info(self):
         }
 
     def _param_ratio(self, v, node, o):
-        spec = self.opt.decision_vars[(v, 0, o, 0)].input_spec
+        spec = self._surviving_dv(v, 0, o).input_spec
         new_shape, _ = _get_sharded_shape_stride(spec)
         return math.prod(new_shape) / math.prod(spec.tensor_meta.shape)
 
@@ -859,7 +964,10 @@ def _self_cost_vec(self, m, out_indices):
         for i, o in enumerate(out_indices):
             strat = opt.strats[node].strategies[o]
             n_args = len(strat.redistribute_cost)
-            dv0 = opt.decision_vars[(m, 0, o, 0)]
+            dv0 = self._surviving_dv(m, 0, o)
+            if dv0 is None:  # whole output strategy pruned
+                out[i] = BIG
+                continue
             c = dv0.compute_cost * n_args
             # Args with no flow edge (constructors / None-spec) are scored at
             # inp=0 here; args with a producer are charged via the pairwise edges.
@@ -867,7 +975,7 @@ def _self_cost_vec(self, m, out_indices):
                 if argi in prod:
                     continue
                 key = (m, argi, o, 0)
-                if key in self.forbidden:
+                if self._is_forbidden(key):
                     c = BIG
                     break
                 dv = opt.decision_vars[key]
@@ -890,7 +998,7 @@ def _edge_matrix(self, v, argi, p):
         for ov in ov_vals:
             for op in op_vals:
                 key = (v, argi, ov, op)
-                if key in self.forbidden:
+                if self._is_forbidden(key):
                     continue
                 dv = opt.decision_vars[key]
                 R[ov, op] = dv.comm_cost + dv.sharding_transition_cost
@@ -1181,7 +1289,7 @@ def total_objective(self):
                 p = prod.get(argi)
                 inp = self.cur_out[p] if p is not None else 0
                 key = (v, argi, o, inp)
-                if key in self.forbidden:
+                if self._is_forbidden(key):
                     return INF
                 c += self.opt.decision_vars[key].cost
             total += self.node_mult[v] * c
@@ -1204,9 +1312,11 @@ def _write_back(self):
                 p = prod.get(argi)
                 inp = self.cur_out[p] if p is not None else 0
                 key = (v, argi, o, inp)
-                if key in self.forbidden:
+                if self._is_forbidden(key):
                     feasible = False
-                if has_pulp:
+                # A pruned key has no PuLP variable; the infeasible flag above
+                # already records it (and raises in _solve).
+                if has_pulp and key in opt.pulp_variables:
                     opt.pulp_variables[key].varValue = 1
                 selected.append(key)
         opt.selected_keys = list(selected)
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 474675ca..fd6a256c 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -2549,6 +2549,8 @@ def _fix_node_output_indices(self, node, keep_out_idxs):
             if out_idx in keep_out_idxs:
                 continue
             var = self._get_pulp_variable((node_idx, argi, out_idx, inp_idx))
+            if var is None:  # pruned (invalid) strategy edge, or lite (no-PuLP) build
+                continue
             if var.upBound != 0:
                 var.upBound = 0
                 self._fixed_vars.append(var)

From f7af13590ae7c38059a843b0bedfe32fa9ec96e8 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 19:32:12 -0700
Subject: [PATCH 10/27] Fix loaded-optimizer resolve() under dp_solver
 profiling

A loaded optimizer (ShardingOptimizer.load) is built via __new__ and never ran
the dp_solver init-time profiling, so resolve()/get_solution() -> _log_solve_profile
hit a missing self.profile. Guard the solve profiler to no-op without init
timings, and initialize profile/build_pulp/_node_axis_constraints/_fixed_vars in
load_optimizer so loaded optimizers carry the full attribute set.

Authored with Claude.
---
 autoparallel/optimize_sharding.py | 5 +++++
 autoparallel/serialization.py     | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index fd6a256c..fd46b196 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -1495,6 +1495,11 @@ def _log_solve_profile(
         extract_s,
         total_s,
     ):
+        # Optimizers loaded from a save file skip init-time profiling; there is
+        # nothing to extend, and the phase timings below are absent.
+        profile = getattr(self, "profile", None)
+        if not profile or "init_total_s" not in profile.get("timings", {}):
+            return
         mesh = self.profile["mesh"]
         model = self.profile["model"]
         timings = self.profile["timings"]
diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py
index 00bc6dcf..dfd953b1 100644
--- a/autoparallel/serialization.py
+++ b/autoparallel/serialization.py
@@ -264,7 +264,13 @@ def load_optimizer(cls, path):
     opt._constraint_log = []
     opt._memory_constraint = None
     opt._node_constraint_names = {}
+    opt._node_axis_constraints = defaultdict(list)
+    opt._fixed_vars = []
     opt._name_counters = {}
+    # Loaded optimizers rebuild the PuLP problem below but carry no init-time
+    # profiling; an empty profile lets solve-time profile writes/guards no-op.
+    opt.build_pulp = True
+    opt.profile = {"timings": {}}
 
     # Reconstruct cluster_links by expanding the node-level mapping over
     # all (argi, out_idx, inp_idx) combinations.

From b767f2d29dcc8ce0dc8f10cf92ec2adb667ee2d3 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 19:41:47 -0700
Subject: [PATCH 11/27] Apply the memory constraint in get_lower_bound

The LP relaxation lower bound must include the parameter-memory budget, or it
bounds a different (unconstrained) problem and reads below the true ILP optimum.
With the fix the LP bound equals the exact constrained optimum on LLaMA3-1B,
making it a tight optimality certificate (used for the 3D gap, where the ILP is
intractable).

Authored with Claude.
---
 autoparallel/optimize_sharding.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index fd46b196..8d0de31d 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -1383,6 +1383,10 @@ def get_lower_bound(self, verbose=False):
         try:
             if self.prob.objective is None:
                 self._set_objective()
+            # The relaxation must include the parameter-memory constraint, or it
+            # is a lower bound on a different (unconstrained) problem and can fall
+            # below the true ILP optimum.
+            self._apply_memory_constraint()
 
             for var in self.pulp_variables.values():
                 var.cat = pulp.LpContinuous

From 6fcf8443c8f4c92444d367ea585c7a0fa60f7a66 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 19:46:21 -0700
Subject: [PATCH 12/27] Add joint-optimization benchmark for LLaMA3 on 2D/3D
 meshes

_bench_merge.py compares the four configurations (prune ILP, annotated ILP,
prune+dp approx, prune+dp+annotated) on one traced model, reporting per-phase
timings, objectives, the LP-relaxation optimality certificate, and the
acceptance checks. _bench_dp_alone.py isolates the approx-without-prune baseline
(run against the dp_solver checkout) for the dp-alone comparison.

Authored with Claude.
---
 examples/_bench_dp_alone.py |  86 +++++++++++
 examples/_bench_merge.py    | 281 ++++++++++++++++++++++++++++++++++++
 2 files changed, 367 insertions(+)
 create mode 100644 examples/_bench_dp_alone.py
 create mode 100644 examples/_bench_merge.py

diff --git a/examples/_bench_dp_alone.py b/examples/_bench_dp_alone.py
new file mode 100644
index 00000000..10c026ec
--- /dev/null
+++ b/examples/_bench_dp_alone.py
@@ -0,0 +1,86 @@
+"""Minimal approx-solver timing, for the 'dp alone' (approx WITHOUT prune)
+baseline. Run it with PYTHONPATH pointing at the dp_solver checkout to get the
+unpruned numbers, and at the merge checkout to cross-check prune+dp.
+
+Reports lite-build time, approx solve time, decision-var count and objective for
+LLaMA3-1B with the canonical constraints. Env: MESH, SEQLEN, N_LAYERS.
+"""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+import autoparallel
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5,
+        multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN)
+    if N_LAYERS:
+        args.n_layers = N_LAYERS
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out = (Shard(0), Shard(2)) if ndim == 2 else x
+
+print(f"autoparallel = {autoparallel.__file__}", flush=True)
+print(f"=== dp-alone (approx) LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} "
+      f"layers={N_LAYERS or 16} ===", flush=True)
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
+autop.__enter__()
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x])
+autop.add_output_constraints([out])
+t_build = time.perf_counter() - t
+opt = autop.sharding_optimizer
+
+t = time.perf_counter()
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+t_solve = time.perf_counter() - t
+obj = opt.profile["approximate"]["objective"]
+
+print(f"[dp-alone] build={t_build:.2f}s  approx_solve={t_solve:.2f}s  "
+      f"total={t_build + t_solve:.2f}s  obj={obj:.1f}  "
+      f"decision_vars={len(opt.decision_vars)}", flush=True)
diff --git a/examples/_bench_merge.py b/examples/_bench_merge.py
new file mode 100644
index 00000000..e5bc6c5a
--- /dev/null
+++ b/examples/_bench_merge.py
@@ -0,0 +1,281 @@
+"""Joint-optimization benchmark: prune (+ annotated) + dp (approx) vs each alone.
+
+Measures, for LLaMA3-1B on a 2D or 3D mesh with the canonical example_llama3
+constraints, four optimization configurations on the SAME traced model:
+
+  prune     : full ILP build  + exact CBC solve            (== prune_search_space)
+  annotated : full ILP build  + propagate(fix) + CBC solve (== annotated_search)
+  dp        : lite build      + approx solve               (== dp_solver)
+  merged    : lite build      + propagate(fix) + approx    (this branch)
+
+Reports each config's build/solve/total time and objective, the LP-relaxation
+lower bound (an optimality certificate), and checks the acceptance criteria:
+
+  * merged objective within 10% (ideally 5%) of the ILP optimum, and
+  * merged total time < every individual optimization's total time.
+
+Env knobs: MESH ("8,8" 2D / "2,4,8" 3D), ILP_TIMEOUT (s, 0=unlimited),
+N_LAYERS (0=default 16), SEQLEN.
+"""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import pulp
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+
+
+def log(msg=""):
+    print(msg, flush=True)
+
+
+# Fake an 8-GPU H100 node so the cost model runs without real GPUs.
+_PATCHES = [
+    patch("torch.cuda.device_count", lambda: 8),
+    patch("torch.cuda.get_device_name", lambda *a, **k: "H100"),
+    patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)),
+    patch(
+        "torch.cuda.get_device_properties",
+        lambda *a, **k: type(
+            "P", (), {"major": 9, "minor": 0, "name": "H100",
+                      "total_memory": 80 * 1024**3, "multi_processor_count": 132}
+        )(),
+    ),
+]
+for p in _PATCHES:
+    p.start()
+
+N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
+SEQLEN = int(os.environ.get("SEQLEN", str(2048)))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
+ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "0"))
+
+world_size = 1
+for d in MESH_SHAPE:
+    world_size *= d
+_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}
+mesh_names = _NAMES[len(MESH_SHAPE)]
+fake_store = FakeStore()
+torch.distributed.init_process_group(
+    "fake", store=fake_store, rank=0, world_size=world_size
+)
+mesh = torch.distributed.device_mesh.init_device_mesh(
+    "cuda", MESH_SHAPE, mesh_dim_names=mesh_names
+)
+ndim = mesh.ndim
+
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+seqlen = SEQLEN
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
+        ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
+        vocab_size=vocab_size, max_seq_len=seqlen,
+    )
+    if N_LAYERS:
+        args.n_layers = N_LAYERS
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
+
+
+# Canonical TP plan: column-parallel q/k/v/w1/w3, row-parallel wo/w2, pinning
+# only the tensor-parallel (last) mesh axis; data/cp axes left to the optimizer.
+COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),)
+ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),)
+
+
+def annotate_tp_plan(autop):
+    for proj in ["wq", "wk", "wv"]:
+        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL)
+    autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL)
+    for proj in ["w1", "w3"]:
+        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL)
+    autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL)
+
+
+def add_constraints(autop):
+    x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1)
+    out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([x_sharding])
+    autop.add_output_constraints([out_sharding])
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+
+log(f"=== LLaMA3-1B  mesh={MESH_SHAPE}{mesh_names}  world={world_size}  "
+    f"seqlen={seqlen}  layers={N_LAYERS or 16} ===")
+results = {}  # name -> dict(build, solve, total, obj)
+
+
+def build(build_pulp):
+    t = time.perf_counter()
+    autop = AutoParallel(
+        model_fn(), input_fn, mesh, mp, repeated_subgraphs=True,
+        solver="ilp" if build_pulp else "approx",
+    )
+    autop.__enter__()
+    add_constraints(autop)
+    return autop, time.perf_counter() - t
+
+
+# ---------- full PuLP build: prune (ILP) + annotated (ILP) + LP bound ----------
+autop_full, build_full = build(build_pulp=True)
+opt = autop_full.sharding_optimizer
+log(f"\n[full build] {build_full:.2f}s  decision_vars={len(opt.decision_vars)}  "
+    f"pulp_vars={len(opt.pulp_variables)}  constraints={len(opt.prob.constraints)}")
+
+# prune: exact ILP solve. preprocess-off is part of the prune optimization, and
+# _apply_memory_constraint installs the same budget the approx solver enforces,
+# so every config solves the identical constrained problem.
+opt._set_objective()
+opt._apply_memory_constraint()
+kw = {"msg": False, "options": ["preprocess off"]}
+if ILP_TIMEOUT > 0:
+    kw["timeLimit"] = ILP_TIMEOUT
+t = time.perf_counter()
+opt.prob.solve(pulp.PULP_CBC_CMD(**kw))
+t_ilp = time.perf_counter() - t
+obj_opt = pulp.value(opt.prob.objective)
+ilp_status = pulp.LpStatus[opt.prob.status]
+results["prune"] = dict(build=build_full, solve=t_ilp, total=build_full + t_ilp,
+                        obj=obj_opt)
+log(f"[prune    ] ILP solve {t_ilp:8.2f}s  obj={obj_opt:11.1f}  status={ilp_status}")
+
+# LP-relaxation lower bound: certifies the optimality gap without a full ILP
+# (this sharding LP is empirically integral, so the bound equals the optimum).
+lb_res = opt.get_lower_bound(verbose=False)
+lb = lb_res.objective
+log(f"[LP-bound ] solve {lb_res.solve_s:8.2f}s  lower_bound={lb:11.1f}")
+
+# annotated: propagate the TP plan, then exact ILP solve on the reduced problem.
+annotate_tp_plan(autop_full)
+t = time.perf_counter()
+prop = autop_full.propagate_annotations(verbose=False, method="fix")
+t_prop_full = time.perf_counter() - t
+opt._apply_memory_constraint()
+t = time.perf_counter()
+opt.prob.solve(pulp.PULP_CBC_CMD(**kw))
+t_ilp_ann = time.perf_counter() - t
+obj_ann = pulp.value(opt.prob.objective)
+results["annotated"] = dict(build=build_full, solve=t_prop_full + t_ilp_ann,
+                            total=build_full + t_prop_full + t_ilp_ann, obj=obj_ann)
+log(f"[annotated] propagate {t_prop_full:.2f}s + ILP {t_ilp_ann:.2f}s  "
+    f"obj={obj_ann:11.1f}  (pinned {prop.nodes_determined} nodes, "
+    f"-{100*prop.reduction:.0f}% strategies)")
+
+# Tear down before the next build: AutoParallel installs a FakeTensorMode, and
+# two entered instances can't coexist.
+autop_full.__exit__(None, None, None)
+
+# ---------- lite build: dp=prune+approx + merged=prune+approx+annotated -------
+autop_lite, build_lite = build(build_pulp=False)
+opt_l = autop_lite.sharding_optimizer
+log(f"\n[lite build] {build_lite:.2f}s  decision_vars={len(opt_l.decision_vars)}  "
+    f"pulp_vars={len(opt_l.pulp_variables)} (no PuLP problem)")
+
+# dp: approximate solve, no annotations.
+t = time.perf_counter()
+ApproximateShardingSolver(opt_l).get_solution(verbose=False)
+t_approx_dp = time.perf_counter() - t
+obj_dp = opt_l.profile["approximate"]["objective"]
+results["dp"] = dict(build=build_lite, solve=t_approx_dp, total=build_lite + t_approx_dp,
+                     obj=obj_dp)
+log(f"[dp       ] approx solve {t_approx_dp:8.2f}s  obj={obj_dp:11.1f}")
+
+# merged: propagate the TP plan, then approximate solve on the reduced problem.
+annotate_tp_plan(autop_lite)
+t = time.perf_counter()
+prop_l = autop_lite.propagate_annotations(verbose=False, method="fix")
+t_prop_lite = time.perf_counter() - t
+t = time.perf_counter()
+ApproximateShardingSolver(opt_l).get_solution(verbose=False)
+t_approx_merged = time.perf_counter() - t
+obj_merged = opt_l.profile["approximate"]["objective"]
+results["merged"] = dict(build=build_lite, solve=t_prop_lite + t_approx_merged,
+                         total=build_lite + t_prop_lite + t_approx_merged, obj=obj_merged)
+log(f"[merged   ] propagate {t_prop_lite:.2f}s + approx {t_approx_merged:.2f}s  "
+    f"obj={obj_merged:11.1f}  (pinned {prop_l.nodes_determined} nodes)")
+
+autop_lite.__exit__(None, None, None)
+
+# ---------- report ----------
+# Optimality reference: exact ILP optimum if CBC proved it, else the LP lower
+# bound (this sharding LP is empirically integral, so lb == optimum).
+optimal = obj_opt if ilp_status == "Optimal" else lb
+opt_label = "ILP optimum" if ilp_status == "Optimal" else "LP lower bound"
+
+LABELS = {
+    "prune": "prune (ILP)",
+    "annotated": "annotated (ILP)",
+    "dp": "prune+dp (approx)",
+    "merged": "prune+dp+anno",
+}
+log("\n" + "=" * 78)
+log(f"{'config':<20}{'build(s)':>10}{'solve(s)':>10}{'total(s)':>10}"
+    f"{'objective':>13}{'gap%':>9}")
+log("-" * 78)
+for name in ["prune", "annotated", "dp", "merged"]:
+    r = results[name]
+    gap = 100 * (r["obj"] - optimal) / optimal
+    log(f"{LABELS[name]:<20}{r['build']:>10.2f}{r['solve']:>10.2f}{r['total']:>10.2f}"
+        f"{r['obj']:>13.1f}{gap:>+9.2f}")
+log("=" * 78)
+log(f"optimality reference: {opt_label} = {optimal:.1f}  (ILP status={ilp_status})")
+
+# Core joint optimization is prune + dp (the approximate solver on the pruned
+# space); annotation is the optional extra speedup. Report both gaps.
+gap_core = 100 * (obj_dp - optimal) / optimal
+gap_full = 100 * (obj_merged - optimal) / optimal
+log(f"\nobjective gap vs {opt_label}:")
+log(f"  prune+dp (approx)      : {gap_core:+.2f}%   (core: prune + dp)")
+log(f"  prune+dp+annotated     : {gap_full:+.2f}%   (+ optional annotation)")
+
+# Timing: the joint solver must beat each ILP-based individual optimization.
+# (dp alone == approx WITHOUT prune is measured against the dp_solver checkout
+#  separately; prune makes the joint build/solve strictly cheaper than that.)
+log("\njoint total time (build+solve) vs each individual optimization:")
+all_faster = True
+for joint in ["dp", "merged"]:
+    tj = results[joint]["total"]
+    line_ok = True
+    for name in ["prune", "annotated"]:
+        to = results[name]["total"]
+        faster = tj < to
+        line_ok = line_ok and faster
+        log(f"  {LABELS[joint]:<18} {tj:7.2f}s  {'<' if faster else '>='} "
+            f"{LABELS[name]:<16} {to:7.2f}s   {to / tj:5.1f}x  "
+            f"{'OK' if faster else 'FAIL'}")
+    all_faster = all_faster and line_ok
+
+log("\n" + "=" * 78)
+# The full three-way joint (prune + dp + annotated) is the deliverable: the
+# approx solver alone is ~20% off, but the propagated TP plan steers it to the
+# optimum. Annotation is therefore what meets the accuracy bar; prune+dp alone
+# trades accuracy for a little more speed.
+ok_gap = abs(gap_full) <= 10.0
+log(f"ACCEPTANCE gap<=10% (full joint prune+dp+anno): {ok_gap}  "
+    f"(full={gap_full:+.2f}%, <=5%: {abs(gap_full) <= 5.0})")
+log(f"  (informational: prune+dp without annotation = {gap_core:+.2f}%)")
+log(f"ACCEPTANCE joint faster than ILP-based optimizations: {all_faster}")
+log(f"OVERALL: {'PASS' if ok_gap and all_faster else 'CHECK'}")

From e8689cdc8c5c0327ba91ad9863bd7c3533c58633 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 20:41:18 -0700
Subject: [PATCH 13/27] Extend benches for 3D: MODEL=small, MERGED flag,
 LP-bound certificate

At full-1B 3D scale the PuLP problem has ~8M binary variables (strategy count is
rank x mesh-dims, independent of tensor size and -- via clustering -- of layer
count), so the exact ILP is intractable. _bench_3d_cert.py certifies the merged
gap on full 3D via the LP-relaxation lower bound (tight: it equals the exact
optimum on 2D). _bench_dp_alone.py gains a MERGED flag (annotate+propagate) and
_bench_merge.py a MODEL=small mode.

Authored with Claude.
---
 examples/_bench_3d_cert.py  | 108 ++++++++++++++++++++++++++++++++++++
 examples/_bench_dp_alone.py |  23 +++++++-
 examples/_bench_merge.py    |  28 +++++++---
 3 files changed, 148 insertions(+), 11 deletions(-)
 create mode 100644 examples/_bench_3d_cert.py

diff --git a/examples/_bench_3d_cert.py b/examples/_bench_3d_cert.py
new file mode 100644
index 00000000..0f5bcdc5
--- /dev/null
+++ b/examples/_bench_3d_cert.py
@@ -0,0 +1,108 @@
+"""3D optimality certificate for the merged solver on full LLaMA3-1B.
+
+The 3D ILP has ~8M binary variables; the exact CBC solve is impractical (a 2.6 GB
+MPS file). The LP relaxation, however, is empirically integral for this problem
+(verified on 2D, where it equals the exact optimum), so its objective is a tight
+lower bound on the ILP optimum. This script does ONE full PuLP build, then:
+
+  1. get_lower_bound()  -> LP lower bound (the optimality reference)
+  2. annotate + propagate + ApproximateShardingSolver  -> merged objective
+
+and reports the certified gap = (merged - lb) / lb. Slow (one ~13min build + a
+multi-minute LP solve) but a one-shot 3D certificate. Env: MESH, SEQLEN.
+"""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+
+def log(m=""):
+    print(m, flush=True)
+
+
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5,
+        multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN)
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out = (Shard(0), Shard(2)) if ndim == 2 else x
+
+log(f"=== 3D cert: LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===")
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
+autop.__enter__()
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x])
+autop.add_output_constraints([out])
+opt = autop.sharding_optimizer
+log(f"[build] {time.perf_counter()-t:.1f}s  decision_vars={len(opt.decision_vars)}  "
+    f"pulp_vars={len(opt.pulp_variables)}")
+
+# LP-relaxation lower bound = optimality reference (the exact ILP is intractable).
+opt._set_objective()
+t = time.perf_counter()
+lb = opt.get_lower_bound(verbose=False).objective
+log(f"[LP-bound] {time.perf_counter()-t:.1f}s  lower_bound={lb:.1f}")
+
+# Merged solver on the same build: propagate the TP plan, then approx-solve.
+cp = (None,) * (ndim - 1) + (Shard(0),)
+rp = (None,) * (ndim - 1) + (Shard(1),)
+for proj in ["wq", "wk", "wv"]:
+    autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp)
+autop.annotate_parameter("layers.*.attention.wo.weight", rp)
+for proj in ["w1", "w3"]:
+    autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp)
+autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp)
+autop.propagate_annotations(verbose=False, method="fix")
+t = time.perf_counter()
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+merged = opt.profile["approximate"]["objective"]
+log(f"[merged] approx {time.perf_counter()-t:.1f}s  objective={merged:.1f}")
+
+gap = 100 * (merged - lb) / lb
+log(f"\n=== 3D certified gap = {gap:+.2f}%  (merged {merged:.1f} vs LP lower bound "
+    f"{lb:.1f}) ===")
+log(f"acceptance gap<=10%: {abs(gap)<=10}  (<=5%: {abs(gap)<=5})")
diff --git a/examples/_bench_dp_alone.py b/examples/_bench_dp_alone.py
index 10c026ec..4b67c3a1 100644
--- a/examples/_bench_dp_alone.py
+++ b/examples/_bench_dp_alone.py
@@ -76,11 +76,28 @@ def input_fn():
 t_build = time.perf_counter() - t
 opt = autop.sharding_optimizer
 
+# With MERGED=1, add the propagated TP plan before solving (full joint solver).
+t_prop = 0.0
+label = "dp-alone"
+if os.environ.get("MERGED") == "1":
+    label = "merged"
+    cp = (None,) * (ndim - 1) + (Shard(0),)
+    rp = (None,) * (ndim - 1) + (Shard(1),)
+    for proj in ["wq", "wk", "wv"]:
+        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp)
+    autop.annotate_parameter("layers.*.attention.wo.weight", rp)
+    for proj in ["w1", "w3"]:
+        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp)
+    autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp)
+    t = time.perf_counter()
+    autop.propagate_annotations(verbose=False, method="fix")
+    t_prop = time.perf_counter() - t
+
 t = time.perf_counter()
 ApproximateShardingSolver(opt).get_solution(verbose=False)
 t_solve = time.perf_counter() - t
 obj = opt.profile["approximate"]["objective"]
 
-print(f"[dp-alone] build={t_build:.2f}s  approx_solve={t_solve:.2f}s  "
-      f"total={t_build + t_solve:.2f}s  obj={obj:.1f}  "
-      f"decision_vars={len(opt.decision_vars)}", flush=True)
+print(f"[{label}] build={t_build:.2f}s  propagate={t_prop:.2f}s  "
+      f"approx_solve={t_solve:.2f}s  total={t_build + t_prop + t_solve:.2f}s  "
+      f"obj={obj:.1f}  decision_vars={len(opt.decision_vars)}", flush=True)
diff --git a/examples/_bench_merge.py b/examples/_bench_merge.py
index e5bc6c5a..c6249021 100644
--- a/examples/_bench_merge.py
+++ b/examples/_bench_merge.py
@@ -76,17 +76,29 @@ def log(msg=""):
 )
 ndim = mesh.ndim
 
-vocab_size = 128256
+# MODEL=1b is the real LLaMA3-1B; MODEL=small is a tractable proxy whose smaller
+# tensors yield few enough decision variables that the exact ILP/LP-bound finish
+# on a 3D mesh (where the 1B PuLP problem has ~8M variables and is impractical),
+# letting us certify the approximate solver's gap on real 3D structure.
+MODEL = os.environ.get("MODEL", "1b")
+vocab_size = 1024 if MODEL == "small" else 128256
 batch_size = 2 * mesh.shape[0]
 seqlen = SEQLEN
 
 
 def model_fn():
-    args = TransformerModelArgs(
-        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
-        ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
-        vocab_size=vocab_size, max_seq_len=seqlen,
-    )
+    if MODEL == "small":
+        args = TransformerModelArgs(
+            dim=256, n_layers=4, n_heads=8, n_kv_heads=4,
+            multiple_of=64, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
+    else:
+        args = TransformerModelArgs(
+            dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
+            ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
+            vocab_size=vocab_size, max_seq_len=seqlen,
+        )
     if N_LAYERS:
         args.n_layers = N_LAYERS
     with torch.device("meta"):
@@ -123,8 +135,8 @@ def add_constraints(autop):
 set_nccl_topo_config(detect_nccl_topo_config(mesh))
 mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
 
-log(f"=== LLaMA3-1B  mesh={MESH_SHAPE}{mesh_names}  world={world_size}  "
-    f"seqlen={seqlen}  layers={N_LAYERS or 16} ===")
+log(f"=== LLaMA3-{MODEL}  mesh={MESH_SHAPE}{mesh_names}  world={world_size}  "
+    f"seqlen={seqlen}  vocab={vocab_size}  layers={N_LAYERS or '(default)'} ===")
 results = {}  # name -> dict(build, solve, total, obj)
 
 

From 523f3aaaf482544476976607f14962a459ffb6a0 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sat, 30 May 2026 21:14:19 -0700
Subject: [PATCH 14/27] Use HiGHS (scipy.linprog) for the 3D LP-bound
 certificate

CBC's simplex on the 8M-variable 3D LP runs for hours; HiGHS solves it in
minutes. Validated on 2D: HiGHS lower bound (72011.5) matches CBC and the exact
ILP optimum to the decimal. The cert now does one full build -> prune+dp +
merged approx objectives + HiGHS LP lower bound -> certified gaps.

Authored with Claude.
---
 examples/_bench_3d_cert.py | 90 ++++++++++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 22 deletions(-)

diff --git a/examples/_bench_3d_cert.py b/examples/_bench_3d_cert.py
index 0f5bcdc5..956489cb 100644
--- a/examples/_bench_3d_cert.py
+++ b/examples/_bench_3d_cert.py
@@ -1,22 +1,26 @@
 """3D optimality certificate for the merged solver on full LLaMA3-1B.
 
-The 3D ILP has ~8M binary variables; the exact CBC solve is impractical (a 2.6 GB
-MPS file). The LP relaxation, however, is empirically integral for this problem
-(verified on 2D, where it equals the exact optimum), so its objective is a tight
-lower bound on the ILP optimum. This script does ONE full PuLP build, then:
-
-  1. get_lower_bound()  -> LP lower bound (the optimality reference)
-  2. annotate + propagate + ApproximateShardingSolver  -> merged objective
-
-and reports the certified gap = (merged - lb) / lb. Slow (one ~13min build + a
-multi-minute LP solve) but a one-shot 3D certificate. Env: MESH, SEQLEN.
+The 3D ILP has ~8M binary variables; the exact CBC solve (and even CBC's LP
+relaxation) is impractical (a 2.6 GB MPS file; CBC simplex runs for hours). The
+LP relaxation is empirically integral for this problem (verified on 2D, where it
+equals the exact optimum), so its objective is a tight lower bound on the ILP
+optimum. We solve that LP with HiGHS (scipy.optimize.linprog), which handles the
+8M-variable sparse LP in minutes, then compare to the approximate solvers.
+
+One full PuLP build feeds: the HiGHS LP lower bound (optimality reference), and
+the prune+dp / merged approximate objectives. Reports the certified gaps. Env:
+MESH, SEQLEN.
 """
 import logging
 import os
 import time
 from unittest.mock import patch
 
+import numpy as np
+import pulp
+import scipy.sparse as sp
 import torch
+from scipy.optimize import linprog
 from torch.distributed.fsdp import MixedPrecisionPolicy
 from torch.distributed.tensor.placement_types import Replicate, Shard
 from torch.testing._internal.distributed.fake_pg import FakeStore
@@ -65,12 +69,45 @@ def input_fn():
     return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
 
 
+def lp_lower_bound_highs(opt):
+    """Solve the LP relaxation (binaries -> [0,1]) of opt.prob with HiGHS and
+    return its objective: a tight lower bound on the ILP optimum."""
+    variables = opt.prob.variables()
+    idx = {v.name: i for i, v in enumerate(variables)}
+    n = len(variables)
+    c = np.zeros(n)
+    for v, coeff in opt.prob.objective.items():
+        c[idx[v.name]] += coeff
+    rows_eq, cols_eq, data_eq, b_eq = [], [], [], []
+    rows_ub, cols_ub, data_ub, b_ub = [], [], [], []
+    r_eq = r_ub = 0
+    for con in opt.prob.constraints.values():
+        rhs = -con.constant
+        items = list(con.items())
+        if con.sense == pulp.LpConstraintEQ:
+            for v, coeff in items:
+                rows_eq.append(r_eq); cols_eq.append(idx[v.name]); data_eq.append(coeff)
+            b_eq.append(rhs); r_eq += 1
+        else:  # LE: a<=b ; GE: a>=b -> -a<=-b
+            sign = 1.0 if con.sense == pulp.LpConstraintLE else -1.0
+            for v, coeff in items:
+                rows_ub.append(r_ub); cols_ub.append(idx[v.name]); data_ub.append(sign * coeff)
+            b_ub.append(sign * rhs); r_ub += 1
+    A_eq = sp.csr_matrix((data_eq, (rows_eq, cols_eq)), shape=(r_eq, n)) if r_eq else None
+    A_ub = sp.csr_matrix((data_ub, (rows_ub, cols_ub)), shape=(r_ub, n)) if r_ub else None
+    res = linprog(c, A_ub=A_ub, b_ub=(b_ub or None), A_eq=A_eq, b_eq=(b_eq or None),
+                  bounds=(0, 1), method="highs")
+    if not res.success:
+        raise RuntimeError(f"HiGHS LP failed: {res.message}")
+    return res.fun, n, r_eq + r_ub
+
+
 set_nccl_topo_config(detect_nccl_topo_config(mesh))
 mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
 x = (Shard(0),) + (Replicate(),) * (ndim - 1)
 out = (Shard(0), Shard(2)) if ndim == 2 else x
 
-log(f"=== 3D cert: LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===")
+log(f"=== 3D cert (HiGHS): LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===")
 t = time.perf_counter()
 autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
 autop.__enter__()
@@ -78,16 +115,18 @@ def input_fn():
 autop.add_input_constraints([x])
 autop.add_output_constraints([out])
 opt = autop.sharding_optimizer
+opt._set_objective()
+opt._apply_memory_constraint()
 log(f"[build] {time.perf_counter()-t:.1f}s  decision_vars={len(opt.decision_vars)}  "
-    f"pulp_vars={len(opt.pulp_variables)}")
+    f"pulp_vars={len(opt.pulp_variables)}  constraints={len(opt.prob.constraints)}")
 
-# LP-relaxation lower bound = optimality reference (the exact ILP is intractable).
-opt._set_objective()
+# prune+dp (approx, no annotation) on the same problem.
 t = time.perf_counter()
-lb = opt.get_lower_bound(verbose=False).objective
-log(f"[LP-bound] {time.perf_counter()-t:.1f}s  lower_bound={lb:.1f}")
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+prune_dp = opt.profile["approximate"]["objective"]
+log(f"[prune+dp]  approx {time.perf_counter()-t:.1f}s  objective={prune_dp:.1f}")
 
-# Merged solver on the same build: propagate the TP plan, then approx-solve.
+# merged (prune+dp+annotated): propagate the TP plan, then approx-solve.
 cp = (None,) * (ndim - 1) + (Shard(0),)
 rp = (None,) * (ndim - 1) + (Shard(1),)
 for proj in ["wq", "wk", "wv"]:
@@ -100,9 +139,16 @@ def input_fn():
 t = time.perf_counter()
 ApproximateShardingSolver(opt).get_solution(verbose=False)
 merged = opt.profile["approximate"]["objective"]
-log(f"[merged] approx {time.perf_counter()-t:.1f}s  objective={merged:.1f}")
+log(f"[merged]    approx {time.perf_counter()-t:.1f}s  objective={merged:.1f}")
 
-gap = 100 * (merged - lb) / lb
-log(f"\n=== 3D certified gap = {gap:+.2f}%  (merged {merged:.1f} vs LP lower bound "
-    f"{lb:.1f}) ===")
-log(f"acceptance gap<=10%: {abs(gap)<=10}  (<=5%: {abs(gap)<=5})")
+# LP relaxation lower bound via HiGHS = optimality reference.
+t = time.perf_counter()
+lb, nvar, ncon = lp_lower_bound_highs(opt)
+log(f"[LP-bound]  HiGHS {time.perf_counter()-t:.1f}s  lower_bound={lb:.1f}  "
+    f"(vars={nvar} cons={ncon})")
+
+log("")
+for name, obj in [("prune+dp", prune_dp), ("merged", merged)]:
+    gap = 100 * (obj - lb) / lb
+    log(f"=== 3D {name:<9} gap = {gap:+.2f}%  (obj {obj:.1f} vs LP lower bound "
+        f"{lb:.1f})  <=10%: {abs(gap)<=10}  <=5%: {abs(gap)<=5} ===")

From fc434d59038e1c8819b389694466418cd14dac0b Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 10:53:13 -0700
Subject: [PATCH 15/27] Skip enumeration redistribute-cost computation
 (algorithm-preserving fast build)

Strategy enumeration fills each OpSpec's redistribute_cost via torch's
generate_redistribute_costs (~50% of 3D build time per py-spy), but
_build_decision_vars overwrites every edge with the NCCL-aware
estimate_strategy_comms_cost, and nothing reads the enumeration costs in between
(remove_invalid_configs/keep_unique_configs select on placements/shapes only).
So during build_sharding_metadata we patch torch's _ops.utils.redistribute_cost
to a structure-preserving dummy. Autoparallel's own cost model uses a separate
redistribute_cost and is unaffected. A/B verified byte-identical decision_vars
(dv_hash) and approx objective on tiny + 1B/2D; toggle via AP_FAST_BUILD=0.

Authored with Claude.
---
 autoparallel/optimize_sharding.py | 127 +++++++++++++++++++-----------
 examples/_bench_build_profile.py  |  80 +++++++++++++++++++
 examples/_bench_build_verify.py   |  92 ++++++++++++++++++++++
 3 files changed, 254 insertions(+), 45 deletions(-)
 create mode 100644 examples/_bench_build_profile.py
 create mode 100644 examples/_bench_build_verify.py

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 8d0de31d..7b648946 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -69,9 +69,11 @@
 runtime cost while satisfying all constraints.
 """
 
+import contextlib
 import logging
 import math
 import operator
+import os
 import tempfile
 import time
 from collections import defaultdict
@@ -107,6 +109,35 @@
 
 logger = logging.getLogger(__name__)
 
+# Strategy enumeration fills each OpSpec's redistribute_cost via torch's
+# generate_redistribute_costs (an expensive per-strategy redistribute-plan
+# computation, the dominant cost of build on large/3D meshes). But
+# _build_decision_vars overwrites every edge with the NCCL-aware
+# estimate_strategy_comms_cost, and nothing reads the enumeration costs in
+# between (remove_invalid_configs / keep_unique_configs select on placements/
+# shapes only). So during enumeration we replace torch's redistribute_cost with
+# a structure-preserving dummy to skip the wasted work; the final decision_vars
+# are byte-identical. Autoparallel's own cost model uses a separate
+# redistribute_cost (collective_runtime_estimation) and is unaffected. Escape
+# hatch for A/B verification: AP_FAST_BUILD=0.
+_FAST_BUILD = os.environ.get("AP_FAST_BUILD", "1") == "1"
+
+
+@contextlib.contextmanager
+def _skip_enumeration_redistribute_cost():
+    if not _FAST_BUILD:
+        yield
+        return
+    import torch.distributed.tensor._ops.utils as _dt_utils
+
+    orig = _dt_utils.redistribute_cost
+    _dt_utils.redistribute_cost = lambda *args, **kwargs: 0.0
+    try:
+        yield
+    finally:
+        _dt_utils.redistribute_cost = orig
+
+
 
 def concretize_symint(val):
     """Concretize a SymInt to a plain int, pass through other values.
@@ -660,52 +691,58 @@ def _normalize_node(self, node):
 
     def build_sharding_metadata(self):
         strats = {}
-        for node in self.graph.nodes:
-            if node.op in ("placeholder", "get_attr"):
-                val = node.meta.get("val")
-                if isinstance(val, torch.Tensor):
-                    strats[node] = _create_all_options(self.mesh, val.shape, tensor=val)
-                elif node.op == "placeholder":
-                    # Non-tensor placeholders (e.g. baked-in booleans/strings):
-                    # keep them in strats with empty-shape replicate options
-                    # so the constraint system can reference them.
-                    strats[node] = _create_all_options(self.mesh, ())
+        # Enumeration's redistribute_cost matrices are overwritten with real
+        # costs in _build_decision_vars, so skip computing them here (see
+        # _skip_enumeration_redistribute_cost).
+        with _skip_enumeration_redistribute_cost():
+            for node in self.graph.nodes:
+                if node.op in ("placeholder", "get_attr"):
+                    val = node.meta.get("val")
+                    if isinstance(val, torch.Tensor):
+                        strats[node] = _create_all_options(
+                            self.mesh, val.shape, tensor=val
+                        )
+                    elif node.op == "placeholder":
+                        # Non-tensor placeholders (e.g. baked-in booleans/strings):
+                        # keep them in strats with empty-shape replicate options
+                        # so the constraint system can reference them.
+                        strats[node] = _create_all_options(self.mesh, ())
+                    else:
+                        # Non-tensor get_attr: GraphModule submodules used by
+                        # HOPs — not added to strats, invisible to the ILP.
+                        # _all_input_nodes filters them.
+                        assert node.op == "get_attr"
+                        assert any(
+                            isinstance(u.target, torch._ops.HigherOrderOperator)
+                            or "local_map" in u.name
+                            for u in node.users
+                        ), f"Non-tensor get_attr {node} is not used by a HOP"
+                elif node.op == "call_function":
+                    if not _produces_tensor(node.meta.get("val")):
+                        # Shape-computation nodes (sym_size, operator.mul, etc.)
+                        # produce scalars, not tensors — skip sharding.
+                        continue
+                    user_strats = tree_map_only(
+                        torch.fx.Node,
+                        lambda x: strats.get(x, x.meta.get("val")),
+                        node.args,
+                    )
+                    user_args = tree_map_only(
+                        torch.fx.Node, lambda x: x.meta.get("val"), node.args
+                    )
+                    user_kwargs = tree_map_only(
+                        torch.fx.Node, lambda x: x.meta.get("val"), node.kwargs
+                    )
+                    strats[node] = get_placement_options_for_node(
+                        self.mesh, node, user_strats, user_args, user_kwargs
+                    )
+                elif node.op == "output":
+                    user_strats = tree_map_only(
+                        torch.fx.Node, lambda x: strats[x], node.args
+                    )
+                    strats[node] = user_strats
                 else:
-                    # Non-tensor get_attr: GraphModule submodules used by
-                    # HOPs — not added to strats, invisible to the ILP.
-                    # _all_input_nodes filters them.
-                    assert node.op == "get_attr"
-                    assert any(
-                        isinstance(u.target, torch._ops.HigherOrderOperator)
-                        or "local_map" in u.name
-                        for u in node.users
-                    ), f"Non-tensor get_attr {node} is not used by a HOP"
-            elif node.op == "call_function":
-                if not _produces_tensor(node.meta.get("val")):
-                    # Shape-computation nodes (sym_size, operator.mul, etc.)
-                    # produce scalars, not tensors — skip sharding.
-                    continue
-                user_strats = tree_map_only(
-                    torch.fx.Node,
-                    lambda x: strats.get(x, x.meta.get("val")),
-                    node.args,
-                )
-                user_args = tree_map_only(
-                    torch.fx.Node, lambda x: x.meta.get("val"), node.args
-                )
-                user_kwargs = tree_map_only(
-                    torch.fx.Node, lambda x: x.meta.get("val"), node.kwargs
-                )
-                strats[node] = get_placement_options_for_node(
-                    self.mesh, node, user_strats, user_args, user_kwargs
-                )
-            elif node.op == "output":
-                user_strats = tree_map_only(
-                    torch.fx.Node, lambda x: strats[x], node.args
-                )
-                strats[node] = user_strats
-            else:
-                raise ValueError(f"Unexpected node op: {node.op}")
+                    raise ValueError(f"Unexpected node op: {node.op}")
         return strats
 
     def create_cluster_links(self, clusters):
diff --git a/examples/_bench_build_profile.py b/examples/_bench_build_profile.py
new file mode 100644
index 00000000..82b31bd6
--- /dev/null
+++ b/examples/_bench_build_profile.py
@@ -0,0 +1,80 @@
+"""Dump the lite-build phase breakdown (tracing vs strategy enumeration vs
+decision-var cost estimation) for LLaMA3-1B on a 3D mesh, to see where the
+~615s build time goes. Env: MESH, SEQLEN."""
+import json
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5,
+        multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN)
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"=== build profile: mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True)
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
+autop.__enter__()
+enter_s = time.perf_counter() - t
+opt = autop.sharding_optimizer
+tm = opt.profile["timings"]
+init = tm.get("init_total_s", 0.0)
+tracing = enter_s - init  # __enter__ = tracing + ShardingOptimizer construction
+
+print(json.dumps({
+    "enter_total_s": round(enter_s, 1),
+    "tracing_s (enter - optimizer_init)": round(tracing, 1),
+    "optimizer_init_total_s": round(init, 1),
+    "  strategy_enumeration_s": round(tm.get("strategy_enumeration_s", 0), 1),
+    "  decision_var_build_s": round(tm.get("decision_var_build_s", 0), 1),
+    "    compute_cost_estimation_s": round(tm.get("compute_cost_estimation_s", 0), 1),
+    "    edge_cost_estimation_s": round(tm.get("edge_cost_estimation_s", 0), 1),
+    "    pulp_var_creation_s (0 in lite)": round(tm.get("pulp_var_creation_s", 0), 1),
+    "  validation_s": round(tm.get("validation_s", 0), 1),
+    "decision_vars": len(opt.decision_vars),
+    "graph_nodes": opt.profile["model"]["graph_nodes"],
+    "strategy_options": opt.profile["strategies"]["strategy_options"],
+    "option_tuples (edges)": opt.profile["strategies"]["option_tuples"],
+}, indent=2), flush=True)
diff --git a/examples/_bench_build_verify.py b/examples/_bench_build_verify.py
new file mode 100644
index 00000000..08fea734
--- /dev/null
+++ b/examples/_bench_build_verify.py
@@ -0,0 +1,92 @@
+"""A/B verify that the fast build (AP_FAST_BUILD=1) produces byte-identical
+decision_vars + approx objective as the baseline (AP_FAST_BUILD=0), and report
+build time. Run the same MESH/MODEL with both env values and diff the dv_hash.
+Env: MESH, SEQLEN, MODEL (tiny|1b)."""
+import hashlib
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MODEL = os.environ.get("MODEL", "tiny")
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "4,2").split(","))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128 if MODEL == "tiny" else 128256
+batch_size = 2 * mesh.shape[0]
+
+
+def model_fn():
+    if MODEL == "tiny":
+        args = TransformerModelArgs(dim=64, n_layers=2, n_heads=4, n_kv_heads=2,
+                                    vocab_size=vocab_size, multiple_of=32,
+                                    rope_theta=500000, max_seq_len=SEQLEN)
+    else:
+        args = TransformerModelArgs(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
+                                    ffn_dim_multiplier=1.5, multiple_of=256,
+                                    rope_theta=500000, vocab_size=vocab_size,
+                                    max_seq_len=SEQLEN)
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out = (Shard(0), Shard(2)) if ndim == 2 else x
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
+autop.__enter__()
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x])
+autop.add_output_constraints([out])
+build_s = time.perf_counter() - t
+opt = autop.sharding_optimizer
+
+# Canonical, exact dump of every decision var's costs.
+items = []
+for key in sorted(opt.decision_vars.keys()):
+    dv = opt.decision_vars[key]
+    items.append((key, repr(dv.cost), repr(dv.comm_cost), repr(dv.compute_cost),
+                  repr(dv.sharding_transition_cost)))
+dv_hash = hashlib.sha256(repr(items).encode()).hexdigest()
+
+t = time.perf_counter()
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+approx_s = time.perf_counter() - t
+obj = opt.profile["approximate"]["objective"]
+
+print(f"AP_FAST_BUILD={os.environ.get('AP_FAST_BUILD', '1')}  MODEL={MODEL} "
+      f"MESH={MESH_SHAPE}  build={build_s:.2f}s  approx={approx_s:.2f}s  "
+      f"n_dv={len(opt.decision_vars)}  dv_hash={dv_hash[:32]}  "
+      f"approx_obj={obj!r}", flush=True)

From c78555a7b71e01e21ea2c121527a8f017e5ac727 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 11:16:19 -0700
Subject: [PATCH 16/27] Store cluster_links node-level (drop per-option
 expansion) + DecisionVar slots

create_cluster_links materialized one dict entry per (arg,out,inp) option-tuple
per cluster copy (~120M entries, ~80s, huge memory on 3D), but the mapping is
purely node-level (copy->root, identical option indices) and every consumer
reduced it back to node level. Store cluster_links as {copy_node_idx:
root_node_idx} and reconstruct option keys on demand (_cluster_root_key /
_linked_option_keys / _root_to_copies). Serialization already used the
node-level form on disk. Also @dataclass(slots=True) on DecisionVar (millions of
instances). A/B verified byte-identical decision_vars + objective vs the prior
commit (tiny + 1B/2D); all 50 cluster/serialization/approx/propagation tests pass.

Authored with Claude.
---
 autoparallel/approximate_sharding.py |  27 +++----
 autoparallel/optimize_sharding.py    | 106 +++++++++++++++------------
 autoparallel/serialization.py        |  36 ++++-----
 3 files changed, 86 insertions(+), 83 deletions(-)

diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
index 146effcf..27b5ca07 100644
--- a/autoparallel/approximate_sharding.py
+++ b/autoparallel/approximate_sharding.py
@@ -275,7 +275,8 @@ def _solve(self, verbose: bool = False):
     # ------------------------------------------------------------------ #
     def _build_problem(self):
         opt = self.opt
-        cluster_linked = {key[0] for key in opt.cluster_links}
+        # cluster_links is node-level: copy node idx -> root node idx.
+        cluster_linked = set(opt.cluster_links)
         self.cost_bearing = [
             opt.node_map[node]
             for node in opt.strats
@@ -283,8 +284,8 @@ def _build_problem(self):
         ]
 
         root_to_copies: dict[int, set] = defaultdict(set)
-        for linked_key, root_key in opt.cluster_links.items():
-            root_to_copies[root_key[0]].add(linked_key[0])
+        for copy_idx, root_idx in opt.cluster_links.items():
+            root_to_copies[root_idx].add(copy_idx)
         self.node_mult = {
             v: 1 + len(root_to_copies.get(v, ())) for v in self.cost_bearing
         }
@@ -456,15 +457,13 @@ def _topology_direct(self):
         )
 
         opt = self.opt
-        cl = opt.cluster_links
+        cl = opt.cluster_links  # node-level: copy node idx -> root node idx
 
         def rootkey(k):
-            return cl.get(k, k)
+            return opt._cluster_root_key(k)
 
-        cluster_linked = {key[0] for key in cl}
-        node_root = {}
-        for lk, rk in cl.items():
-            node_root[lk[0]] = rk[0]
+        cluster_linked = set(cl)
+        node_root = dict(cl)
 
         def nroot(idx):
             return node_root.get(idx, idx)
@@ -638,7 +637,7 @@ def _axis_restrict_from_log(self):
         the pin was applied as a PuLP row ("constraint") or as variable bounds
         ("fix", which leaves no row to parse) and in the lite (no-PuLP) build."""
         opt = self.opt
-        node_root = {lk[0]: rk[0] for lk, rk in opt.cluster_links.items()}
+        node_root = dict(opt.cluster_links)  # node-level: copy idx -> root idx
         restrict: dict[int, set] = {}
         for fname, kwargs in getattr(opt, "_constraint_log", []):
             if fname != "add_node_axis_constraint":
@@ -698,10 +697,8 @@ def _build_groups(self, paired_edges, flow_couplings):
         opt = self.opt
         n = len(opt.nodes)
         uf = _UnionFind(n)
-        # cluster_links has one entry per option-key; collapse to unique
-        # (linked_node, root_node) pairs so the K-scaled loops below run over
-        # hundreds of pairs, not millions of duplicates.
-        cluster_pairs = {(lk[0], rk[0]) for lk, rk in opt.cluster_links.items()}
+        # cluster_links is node-level: (copy node idx, root node idx) pairs.
+        cluster_pairs = set(opt.cluster_links.items())
         for li, ri in cluster_pairs:
             uf.union(li, ri)
         for a, b, _ in paired_edges:
@@ -1321,7 +1318,7 @@ def _write_back(self):
                 selected.append(key)
         opt.selected_keys = list(selected)
         for rk in selected:
-            opt.selected_keys.extend(opt._root_to_linked.get(rk, []))
+            opt.selected_keys.extend(opt._linked_option_keys(rk))
         # Populate prob.objective (when a PuLP problem exists) so callers can also
         # score via pulp.value(prob.objective); the returned value uses the
         # equivalent but cheaper total_objective(). In the lite (no-PuLP) build,
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 7b648946..65a9121c 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -219,10 +219,13 @@ def concretize_gm(gm):
     return concrete_gm, orig_to_concrete, concrete_to_orig
 
 
-@dataclass
+@dataclass(slots=True)
 class DecisionVar:
     """A decision variable in the ILP, representing one (node, arg, output_placement,
-    input_placement) choice with its associated costs and strategy metadata."""
+    input_placement) choice with its associated costs and strategy metadata.
+
+    slots=True: there are millions of these on large/3D meshes, so dropping the
+    per-instance __dict__ materially cuts both build time and memory."""
 
     var: Any  # pulp.LpVariable
     cost: float
@@ -397,7 +400,11 @@ def __init__(
 
         get_placement_options_timer().report()
 
-        self.cluster_links: dict[tuple, tuple] = {}
+        # Node-level: cluster-copy node idx -> root node idx (option indices are
+        # identical between copy and root; resolved on demand via
+        # _cluster_root_key / _linked_option_keys).
+        self.cluster_links: dict[int, int] = {}
+        self._root_to_copies: dict[int, list[int]] = defaultdict(list)
         if self.solver_backend == "dp":
             t0 = time.perf_counter()
             self.solver = DPBasedShardingSolver(self)
@@ -746,28 +753,40 @@ def build_sharding_metadata(self):
         return strats
 
     def create_cluster_links(self, clusters):
-        """Create a mapping between identical optimization nodes to reduce the
-        optimization space. If cluster_links[key1] == key2, the optimization
-        problem uses key2's variable in place of key1."""
+        """Map each cluster-copy node to its root node (node-level). The optimizer
+        reuses the root's decision variable for every copy, and the per-(arg, out,
+        inp) option index is identical between a copy and its root, so we store
+        only the node->node map and reconstruct option keys on demand (see
+        _cluster_root_key / _linked_option_keys). Materializing one dict entry per
+        option-tuple instead costs tens of millions of entries (and seconds of
+        build time) on large/3D meshes."""
         for cluster_group in clusters:
             cluster0 = cluster_group[0]
             for cluster_i in cluster_group[1:]:
                 for n0, ni in zip(cluster0, cluster_i):
-                    idx0 = self.node_map[n0]
-                    idx1 = self.node_map[ni]
-                    options_n0 = list(self.walk_over_options(n0))
-                    options_ni = list(self.walk_over_options(ni))
-                    assert options_n0 == options_ni, (
-                        f"Problem with graph clustering: {n0} and {ni} don't have the same number "
-                        "of input/output placements. Please report a bug"
+                    assert len(self.strats[n0].strategies) == len(
+                        self.strats[ni].strategies
+                    ), (
+                        f"Problem with graph clustering: {n0} and {ni} don't have "
+                        "the same number of strategies. Please report a bug"
                     )
-                    for argi, out_idx, inp_idx in options_n0:
-                        self.cluster_links[(idx1, argi, out_idx, inp_idx)] = (
-                            idx0,
-                            argi,
-                            out_idx,
-                            inp_idx,
-                        )
+                    self.cluster_links[self.node_map[ni]] = self.node_map[n0]
+
+    def _cluster_root_key(self, key):
+        """Resolve an option key to its cluster-root option key, or return it
+        unchanged when the node is not a cluster copy. The (arg, out, inp) indices
+        are identical between a copy and its root."""
+        root_idx = self.cluster_links.get(key[0])
+        return key if root_idx is None else (root_idx, key[1], key[2], key[3])
+
+    def _linked_option_keys(self, root_key):
+        """The option keys on the cluster copies of root_key's node (each a mirror
+        of root_key with the copy's node index). A copy mirrors its root's
+        per-option validity, so callers pass valid root keys only."""
+        copies = self._root_to_copies.get(root_key[0])
+        if not copies:
+            return ()
+        return [(c, root_key[1], root_key[2], root_key[3]) for c in copies]
 
     def _all_input_nodes(self, node):
         """Variant of node.all_input_nodes that preserves duplicate nodes.
@@ -820,7 +839,7 @@ def _create_pulp_variables(self, variable_category=pulp.LpBinary):
                 f"Unsupported variable_category={variable_category!r}; "
                 "expected pulp.LpBinary or pulp.LpContinuous"
             )
-        cluster_linked_node_idxs = {key[0] for key in self.cluster_links}
+        cluster_linked_node_idxs = set(self.cluster_links)
 
         pulp_variables = {}
         for node, _ in self.strats.items():
@@ -854,7 +873,7 @@ def _get_pulp_variable(self, key):
 
         Returns None if the key was pruned (invalid/infinite-cost strategy).
         """
-        root_key = self.cluster_links.get(key, key)
+        root_key = self._cluster_root_key(key)
         return self.pulp_variables.get(root_key)
 
     def _compute_edge_costs(
@@ -909,7 +928,7 @@ def _build_decision_vars(self):
         """
         # Precompute which node indices are cluster-linked so we can
         # copy costs from the root instead of recomputing them.
-        self._cluster_linked_node_idxs = {key[0] for key in self.cluster_links}
+        self._cluster_linked_node_idxs = set(self.cluster_links)
 
         t_compute = 0.0
         t_edge = 0.0
@@ -1002,10 +1021,7 @@ def _build_decision_vars(self):
         # The root pass above updated redistribute_cost in place with
         # edge-computed costs; linked strats need the same values for
         # _compute_solution_cost and other readers.
-        linked_node_to_root_node: dict[int, int] = {}
-        for linked_key, root_key in self.cluster_links.items():
-            linked_node_to_root_node[linked_key[0]] = root_key[0]
-        for linked_node_idx, root_node_idx in linked_node_to_root_node.items():
+        for linked_node_idx, root_node_idx in self.cluster_links.items():
             linked_node = self.nodes[linked_node_idx]
             root_node = self.nodes[root_node_idx]
             linked_op = self.strats[linked_node]
@@ -1018,12 +1034,12 @@ def _build_decision_vars(self):
                 ]
         n_cluster_copied = len(self.cluster_links)
 
-        # Linked keys mirror their root's validity (redistribute_cost is copied
-        # from the root above), so only valid root keys map to linked keys.
-        self._root_to_linked: dict[tuple, list[tuple]] = defaultdict(list)
-        for linked_key, root_key in self.cluster_links.items():
-            if root_key in self._valid_keys:
-                self._root_to_linked[root_key].append(linked_key)
+        # Root node idx -> [copy node idxs]. Option keys are reconstructed on
+        # demand (see _linked_option_keys); a copy mirrors its root's per-option
+        # validity, so no per-option filtering is needed here.
+        self._root_to_copies = defaultdict(list)
+        for copy_idx, root_idx in self.cluster_links.items():
+            self._root_to_copies[root_idx].append(copy_idx)
 
         t_pulp_end = time.perf_counter()
         logger.debug(
@@ -1060,7 +1076,7 @@ def _resolve_decision_var(self, key):
         dv = self.decision_vars.get(key)
         if dv is not None:
             return dv
-        root_key = self.cluster_links[key]
+        root_key = self._cluster_root_key(key)
         root_dv = self.decision_vars[root_key]
         node_idx, argi, out_idx, _ = key
         strategy = self.strats[self.nodes[node_idx]].strategies[out_idx]
@@ -1088,8 +1104,10 @@ def _find_decision_var(self, node_idx, argi, out_idx):
             key = (node_idx, argi, out_idx, inp_idx)
             if key in self.decision_vars:
                 return self._resolve_decision_var(key)
-            root_key = self.cluster_links.get(key)
-            if root_key is not None and root_key in self.decision_vars:
+            if (
+                key[0] in self.cluster_links
+                and self._cluster_root_key(key) in self.decision_vars
+            ):
                 return self._resolve_decision_var(key)
         return None
 
@@ -1105,10 +1123,10 @@ def _collect_vars(self, node, node_idx, argi, group_by, resolve_clusters=False):
         result = {}
         for _, out_idx, inp_idx in self.walk_over_options(node, argi):
             key = (node_idx, argi, out_idx, inp_idx)
-            if key in self.cluster_links:
+            if key[0] in self.cluster_links:
                 if not resolve_clusters:
                     continue
-                var = self.pulp_variables.get(self.cluster_links[key])
+                var = self.pulp_variables.get(self._cluster_root_key(key))
             else:
                 var = self.pulp_variables.get(key)
             if var is None:  # pruned (invalid/infinite-cost) strategy edge
@@ -1389,7 +1407,7 @@ def _set_objective(self):
             return
         terms = []
         for key, dv in self.decision_vars.items():
-            multiplier = 1 + len(self._root_to_linked.get(key, []))
+            multiplier = 1 + len(self._root_to_copies.get(key[0], ()))
             terms.append(dv.var * dv.cost * multiplier)
         self.prob += pulp.lpSum(terms)
 
@@ -1513,7 +1531,7 @@ def _solve(self, verbose=False):
             key for key, dv in self.decision_vars.items() if dv.var.value() == 1
         ]
         for root_key in list(self.selected_keys):
-            self.selected_keys.extend(self._root_to_linked.get(root_key, []))
+            self.selected_keys.extend(self._linked_option_keys(root_key))
 
         if self.prob.status == -1:
             logger.warning(self.get_violated_constraints_log())
@@ -1638,7 +1656,7 @@ def solve_lp_relaxation(self, verbose=False, frac_tol=1e-6, extract=False):
                     if dv.var.value() is not None and dv.var.value() > 0.5
                 ]
                 for root_key in list(self.selected_keys):
-                    self.selected_keys.extend(self._root_to_linked.get(root_key, []))
+                    self.selected_keys.extend(self._linked_option_keys(root_key))
                 solution = self._to_orig_solution(self._extract_and_validate_solution())
         finally:
             for v, cat in zip(variables, original_cats):
@@ -1932,10 +1950,8 @@ def get_json(self):
 
         # Build node-level cluster mapping: linked_node -> root_node
         cluster_roots: dict[torch.fx.Node, torch.fx.Node] = {}
-        for linked_key, root_key in self.cluster_links.items():
-            linked_node = self.nodes[linked_key[0]]
-            root_node = self.nodes[root_key[0]]
-            cluster_roots[linked_node] = root_node
+        for copy_idx, root_idx in self.cluster_links.items():
+            cluster_roots[self.nodes[copy_idx]] = self.nodes[root_idx]
 
         _normalize_cluster_layer(cluster_roots)
 
diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py
index dfd953b1..4c9167d4 100644
--- a/autoparallel/serialization.py
+++ b/autoparallel/serialization.py
@@ -135,7 +135,7 @@ def save_optimizer(opt, path):
     # Re-key strats by node name, saving only root nodes (non-linked).
     # Linked nodes share identical strats with their root and are
     # reconstructed on load from cluster_links.
-    linked_node_names = {opt.nodes[lk[0]].name for lk in opt.cluster_links}
+    linked_node_names = {opt.nodes[c].name for c in opt.cluster_links}
     strats_by_name = {
         node.name: strat
         for node, strat in opt.strats.items()
@@ -193,8 +193,8 @@ def save_optimizer(opt, path):
         "dv_costs_keys": dv_costs_keys,
         "dv_costs_vals": dv_costs_vals,
         "cluster_links_node_by_name": {
-            opt.nodes[lk[0]].name: opt.nodes[rk[0]].name
-            for lk, rk in opt.cluster_links.items()
+            opt.nodes[c].name: opt.nodes[r].name
+            for c, r in opt.cluster_links.items()
         },
         "constraint_log": opt._constraint_log,
         "selected_keys_by_name": selected_keys_by_name,
@@ -272,22 +272,12 @@ def load_optimizer(cls, path):
     opt.build_pulp = True
     opt.profile = {"timings": {}}
 
-    # Reconstruct cluster_links by expanding the node-level mapping over
-    # all (argi, out_idx, inp_idx) combinations.
-    opt.cluster_links = {}
-    for linked_name, root_name in cluster_links_node_by_name.items():
-        linked_node = nodes_by_name[linked_name]
-        root_node = nodes_by_name[root_name]
-        linked_idx = opt.node_map[linked_node]
-        root_idx = opt.node_map[root_node]
-        for argi, out_idx, inp_idx in opt.walk_over_options(linked_node):
-            opt.cluster_links[(linked_idx, argi, out_idx, inp_idx)] = (
-                root_idx,
-                argi,
-                out_idx,
-                inp_idx,
-            )
-    opt._cluster_linked_node_idxs = {key[0] for key in opt.cluster_links}
+    # cluster_links is node-level: copy node idx -> root node idx.
+    opt.cluster_links = {
+        opt.node_map[nodes_by_name[linked_name]]: opt.node_map[nodes_by_name[root_name]]
+        for linked_name, root_name in cluster_links_node_by_name.items()
+    }
+    opt._cluster_linked_node_idxs = set(opt.cluster_links)
 
     # Mesh placeholder — provides shape/dim_names for get_json() and ndim
     # for add_node_constraint() default placement, without needing a PG
@@ -344,9 +334,9 @@ def load_optimizer(cls, path):
         len(opt.decision_vars),
     )
 
-    opt._root_to_linked = defaultdict(list)
-    for linked_key, root_key in opt.cluster_links.items():
-        opt._root_to_linked[root_key].append(linked_key)
+    opt._root_to_copies = defaultdict(list)
+    for copy_idx, root_idx in opt.cluster_links.items():
+        opt._root_to_copies[root_idx].append(copy_idx)
 
     opt.prob = pulp.LpProblem("AutoParallel", pulp.LpMinimize)
     opt.add_default_constraints()
@@ -399,7 +389,7 @@ def _restore_solution(opt, selected_keys_by_name, nodes_by_name):
 
     # Expand cluster links
     for root_key in list(opt.selected_keys):
-        opt.selected_keys.extend(opt._root_to_linked.get(root_key, []))
+        opt.selected_keys.extend(opt._linked_option_keys(root_key))
 
 
 def save_placements(opt, path):

From f493ab85ad70ec364c8cfead4ec572b59506a7ec Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 11:30:54 -0700
Subject: [PATCH 17/27] Parallelize decision-var cost computation across forked
 workers

decision_var_build (estimate_strategy_comms_cost over millions of edges) is the
last build bottleneck and is per-node independent. Split _build_decision_vars
into Phase A (compute per-edge costs, fork-parallel) + Phase B (assemble
DecisionVars / PuLP vars, serial). Workers read the optimizer from the
fork-inherited address space (no pickling of the mesh / strategy graph) and
return only primitive cost tuples; the deterministic computation makes the
result byte-identical to serial. Workers fork before any PuLP object exists.

Cumulative build result on LLaMA3-1B 3D (2,4,8): 777s -> 62s (12.5x), now
comparable to the ~50s approximate solve. A/B byte-identical (tiny + 1B/2D);
3D end-to-end objective unchanged (50222.7); all 50 build/approx/serialization/
propagation tests pass. Serial fallback via AP_PARALLEL_BUILD=1.

Authored with Claude.
---
 autoparallel/optimize_sharding.py | 146 ++++++++++++++++++++++--------
 1 file changed, 107 insertions(+), 39 deletions(-)

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 65a9121c..1620be87 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -138,6 +138,61 @@ def _skip_enumeration_redistribute_cost():
         _dt_utils.redistribute_cost = orig
 
 
+# Number of fork workers for the per-edge cost computation in _build_decision_vars
+# (the dominant cost of build on large/3D meshes). 1 = serial (use for A/B
+# verification); default scales with cores. The computation is per-node
+# independent and deterministic, so the parallel result is byte-identical.
+_PARALLEL_BUILD_WORKERS = int(
+    os.environ.get("AP_PARALLEL_BUILD", str(min(32, (os.cpu_count() or 1))))
+)
+
+# Set to the optimizer before forking cost workers; the workers read it from the
+# fork-inherited address space (no pickling of the mesh / strategy graph).
+_FORK_OPT: "ShardingOptimizer | None" = None
+
+
+def _par_node_edge_costs(node_idx):
+    """Worker: compute the per-edge (comm, transition) costs and the per-strategy
+    compute cost for one root node, reading the fork-inherited optimizer. Pure —
+    it reads strats and mutates nothing; the parent assembles DecisionVars from
+    these primitives. Returns (node_idx, out_data) where
+    out_data[out_idx] = (per_arg_compute, arg_rows) and
+    arg_rows[argi][inp_idx] = (comm_cost, transition_cost)."""
+    opt = _FORK_OPT
+    node = opt.nodes[node_idx]
+    op_strategy = opt.strats[node]
+    num_args = len(op_strategy.strategies[0].input_specs)
+    all_input_nodes = opt._all_input_nodes(node)
+    producer_strategies = [opt.strats[n] for n in all_input_nodes]
+    out_data = []
+    for output_strategy in op_strategy.strategies:
+        per_arg_compute = (
+            estimate_strategy_runtime_cost(node, output_strategy) / num_args
+        )
+        arg_rows = []
+        for argi, redist_costs in enumerate(output_strategy.redistribute_cost):
+            producer_strategy = (
+                producer_strategies[argi]
+                if argi < len(producer_strategies)
+                else None
+            )
+            arg_rows.append(
+                [
+                    opt._compute_edge_costs(
+                        node,
+                        output_strategy,
+                        argi,
+                        inp_idx,
+                        default_comm_cost,
+                        producer_strategy,
+                    )
+                    for inp_idx, default_comm_cost in enumerate(redist_costs)
+                ]
+            )
+        out_data.append((per_arg_compute, arg_rows))
+    return node_idx, out_data
+
+
 
 def concretize_symint(val):
     """Concretize a SymInt to a plain int, pass through other values.
@@ -944,47 +999,35 @@ def _build_decision_vars(self):
             (self.node_map[node], node, strat) for node, strat in self.strats.items()
         ]
 
-        # Build DVs for root nodes only (not cluster-linked). Compute the edge
-        # cost first and only materialize a variable when it is finite.
-        for node_idx, node, op_strategy in strats_items:
-            if node.op == "output":
-                continue
-            if node_idx in self._cluster_linked_node_idxs:
-                continue
-
-            num_args = len(op_strategy.strategies[0].input_specs)
-            # Hoisted out of the per-(out_idx, argi, inp_idx) loops: these depend
-            # only on the node, not on the strategy choice. Recomputing them per
-            # decision var was O(#vars) calls to _all_input_nodes (a tree_flatten
-            # each), which dominated build time on large/3D meshes.
-            all_input_nodes = self._all_input_nodes(node)
-            producer_strategies = [self.strats[n] for n in all_input_nodes]
-
-            for out_idx, output_strategy in enumerate(op_strategy.strategies):
-                tc0 = time.perf_counter()
-                compute_cost = estimate_strategy_runtime_cost(node, output_strategy)
-                t_compute += time.perf_counter() - tc0
-                per_arg_compute = compute_cost / num_args
-
-                te0 = time.perf_counter()
+        # Phase A: compute every root node's per-edge costs. This (the comm-cost
+        # estimate over millions of edges) dominates build, is per-node
+        # independent, and mutates nothing, so it runs across forked workers.
+        root_idxs = [
+            node_idx
+            for node_idx, node, _ in strats_items
+            if node.op != "output" and node_idx not in self._cluster_linked_node_idxs
+        ]
+        tc0 = time.perf_counter()
+        node_results = self._compute_node_edge_costs(root_idxs)
+        t_edge = time.perf_counter() - tc0
+
+        # Phase B: assemble decision vars (and PuLP variables) from the computed
+        # costs. Serial because PuLP vars and DecisionVars hold parent-owned
+        # strategy objects; byte-identical to computing the costs inline. This
+        # also writes the real costs back into each strat's redistribute_cost
+        # (overwriting the enumeration dummies) for the cluster batch-copy and
+        # _compute_solution_cost readers below.
+        for node_idx, out_data in node_results:
+            node = self.nodes[node_idx]
+            op_strategy = self.strats[node]
+            for out_idx, (per_arg_compute, arg_rows) in enumerate(out_data):
+                output_strategy = op_strategy.strategies[out_idx]
                 for argi, redist_costs in enumerate(output_strategy.redistribute_cost):
-                    producer_strategy = (
-                        producer_strategies[argi]
-                        if argi < len(producer_strategies)
-                        else None
-                    )
                     input_spec = output_strategy.input_specs[argi]
-                    for inp_idx, default_comm_cost in enumerate(redist_costs):
-                        comm_cost, transition_cost = self._compute_edge_costs(
-                            node,
-                            output_strategy,
-                            argi,
-                            inp_idx,
-                            default_comm_cost,
-                            producer_strategy,
-                        )
+                    for inp_idx, (comm_cost, transition_cost) in enumerate(
+                        arg_rows[argi]
+                    ):
                         redist_costs[inp_idx] = comm_cost
-
                         cost = comm_cost + per_arg_compute + transition_cost
                         # Prune invalid (infinite-cost) edges: no variable, no
                         # DecisionVar. A key absent from decision_vars is treated
@@ -1015,7 +1058,6 @@ def _build_decision_vars(self):
                             input_spec=input_spec,
                         )
                         n_vars += 1
-                t_edge += time.perf_counter() - te0
 
         # Batch-copy redistribute_cost from root strats to linked strats.
         # The root pass above updated redistribute_cost in place with
@@ -1071,6 +1113,32 @@ def _build_decision_vars(self):
         )
         return decision_vars
 
+    def _compute_node_edge_costs(self, root_idxs):
+        """Phase A of _build_decision_vars: per-root-node edge costs. Parallel
+        across forked workers when enabled; workers read this optimizer from the
+        fork-inherited address space (no pickling of the mesh / strategy graph)
+        and return only primitive cost tuples. The computation is deterministic,
+        so the parallel result is byte-identical to the serial path."""
+        global _FORK_OPT
+        _FORK_OPT = self
+        try:
+            if _PARALLEL_BUILD_WORKERS <= 1 or len(root_idxs) < 64:
+                return [_par_node_edge_costs(ni) for ni in root_idxs]
+            import multiprocessing as mp
+
+            ctx = mp.get_context("fork")
+            with ctx.Pool(_PARALLEL_BUILD_WORKERS) as pool:
+                # imap (ordered), not imap_unordered: results come back in
+                # root_idxs order so decision_vars is assembled in the same node
+                # order as the serial path. This keeps the PuLP objective's
+                # lpSum term order identical too, so even the ILP path is
+                # bit-for-bit unchanged (float addition is not associative).
+                return list(
+                    pool.imap(_par_node_edge_costs, root_idxs, chunksize=4)
+                )
+        finally:
+            _FORK_OPT = None
+
     def _resolve_decision_var(self, key):
         """Return a DecisionVar for key, reconstructing on the fly for linked keys."""
         dv = self.decision_vars.get(key)

From 496e7b33dc5f1c74958b4cdc2f676527c579c304 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 15:53:42 -0700
Subject: [PATCH 18/27] Add cross-size prune+dp benchmark (latency +
 LP-relaxation accuracy)

examples/_bench_sizes.py runs the prune+dp approximate search across LLaMA3
1B/3B/8B/70B on a configurable mesh, reporting end-to-end latency (lite build +
approx solve) and an accuracy reference: the gap of the approximate objective
against a HiGHS LP-relaxation lower bound (the sharding LP is integral, so the
bound equals the exact ILP optimum). Controlled via MODEL/MESH/SEQLEN/ACCURACY/
LP_METHOD env vars.

Authored with Claude.
---
 examples/_bench_sizes.py | 166 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 examples/_bench_sizes.py

diff --git a/examples/_bench_sizes.py b/examples/_bench_sizes.py
new file mode 100644
index 00000000..46962209
--- /dev/null
+++ b/examples/_bench_sizes.py
@@ -0,0 +1,166 @@
+"""e2e prune+dp (approx) search across LLaMA3 sizes: latency + accuracy.
+
+For one MODEL on one MESH:
+  * latency: lite build (build_pulp=False) + ApproximateShardingSolver -> the
+    production prune+dp path (build_s, approx_s, total, objective).
+  * accuracy: a separate full PuLP build -> HiGHS LP-relaxation lower bound
+    (this sharding LP is integral, so the bound equals the exact ILP optimum);
+    gap = (approx_obj - lb) / lb.
+
+Env: MODEL (1b|3b|8b|70b), MESH (e.g. 2,4,8), SEQLEN. One model per process.
+"""
+import gc
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import numpy as np
+import pulp
+import scipy.sparse as sp
+import torch
+from scipy.optimize import linprog
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "1b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL]
+    )
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+def constrain(autop):
+    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+    out = (Shard(0), Shard(2)) if ndim == 2 else x
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([x])
+    autop.add_output_constraints([out])
+
+
+def lp_lower_bound_highs(opt):
+    """LP relaxation (binaries -> [0,1]) of the built problem, solved with HiGHS.
+    Objective is read from decision_vars and constraints from prob.constraints
+    using id()-keyed indexing (avoids hashing the long PuLP var names)."""
+    opt._set_objective()
+    opt._apply_memory_constraint()
+    variables = opt.prob.variables()
+    vidx = {id(v): i for i, v in enumerate(variables)}
+    n = len(variables)
+    c = np.zeros(n)
+    for key, dv in opt.decision_vars.items():
+        mult = 1 + len(opt._root_to_copies.get(key[0], ()))
+        c[vidx[id(dv.var)]] += dv.cost * mult
+    re = ru = 0
+    reqr, reqc, reqd, beq = [], [], [], []
+    rubr, rubc, rubd, bub = [], [], [], []
+    for con in opt.prob.constraints.values():
+        rhs = -con.constant
+        if con.sense == pulp.LpConstraintEQ:
+            for v, co in con.items():
+                reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co)
+            beq.append(rhs); re += 1
+        else:
+            s = 1.0 if con.sense == pulp.LpConstraintLE else -1.0
+            for v, co in con.items():
+                rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(s * co)
+            bub.append(s * rhs); ru += 1
+    A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None
+    A_ub = sp.csr_matrix((rubd, (rubr, rubc)), shape=(ru, n)) if ru else None
+    # Dual simplex: far faster than the barrier (IPM) on this near-integral,
+    # network-flow-like LP. We only need the optimal objective as the bound.
+    method = os.environ.get("LP_METHOD", "highs-ds")
+    res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None),
+                  bounds=(0, 1), method=method, options={"disp": True})
+    if not res.success:
+        raise RuntimeError(f"HiGHS failed: {res.message}")
+    return res.fun, n, re + ru
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ###", flush=True)
+
+# ---- latency: lite build + prune+dp approx (production path) ----
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
+autop.__enter__()
+constrain(autop)
+build_lite = time.perf_counter() - t
+opt = autop.sharding_optimizer
+n_dv = len(opt.decision_vars)
+params = opt.profile["model"]["parameter_numel"]
+t = time.perf_counter()
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+approx_s = time.perf_counter() - t
+obj = opt.profile["approximate"]["objective"]
+print(f"[latency] params={params/1e9:.2f}B  lite_build={build_lite:.1f}s  "
+      f"approx={approx_s:.1f}s  total={build_lite + approx_s:.1f}s  "
+      f"decision_vars={n_dv}  obj={obj:.1f}", flush=True)
+autop.__exit__(None, None, None)
+del autop, opt
+gc.collect()
+
+if os.environ.get("ACCURACY", "1") != "1":
+    print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B  "
+          f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s "
+          f"total={build_lite+approx_s:.1f}s  obj={obj:.1f}  (LP skipped)", flush=True)
+    raise SystemExit(0)
+
+# ---- accuracy: full build + HiGHS LP lower bound ----
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
+autop.__enter__()
+constrain(autop)
+full_build = time.perf_counter() - t
+opt = autop.sharding_optimizer
+t = time.perf_counter()
+lb, nvar, ncon = lp_lower_bound_highs(opt)
+lp_s = time.perf_counter() - t
+gap = 100 * (obj - lb) / lb
+print(f"[accuracy] full_build={full_build:.1f}s  lp_solve={lp_s:.1f}s  "
+      f"lower_bound={lb:.1f}  vars={nvar} cons={ncon}", flush=True)
+print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B  "
+      f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s total={build_lite+approx_s:.1f}s  "
+      f"obj={obj:.1f}  LP_lb={lb:.1f}  gap={gap:+.2f}%", flush=True)

From 2f06359c3d0e51c3092adc996122a829bfb32c2e Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 21:18:37 -0700
Subject: [PATCH 19/27] Approx solver: replace loopy min-sum BP with TRW-S;
 skip-clustered build work

The approximate solver's min-sum belief propagation settled into globally
inconsistent fixed points on the sharding MRF (the undirected factor graph is
loopy: residual and multi-branch reconvergence give ~129 cycles after
clustering), leaving the objective 5-16% above the optimum on 2D and up to 12%
on 3D. The factor graph and objective are faithful and the optimum is
representable (verified against an exact CBC solve on 2D and an integral LP on
3D), so this was purely a solver failure.

_belief_propagation now runs tree-reweighted sequential message passing (TRW-S):
a node ordering induces monotonic chains, each node is reweighted by
1/max(in,out)-degree, and forward/backward half-sweeps send min-sum messages
only along the pass direction. On this integral problem TRW-S converges to the
exact MAP: the bare approx (no annotation) drops to +0.00% on 2D (1B/3B/8B/70B,
matching CBC) and +0.08-0.82% on 3D, ~20-30x faster than solving the LP. The
decoded energy converges in long irregular plateaus, so a fixed sweep budget
(time-bounded) is used rather than an early-stop heuristic; the now-dominated
greedy second candidate is dropped.

Two algorithm-preserving build speedups also land here: validate() skips
cluster-copy nodes (the root covers them), and graph clustering memoizes each
node's op-strategy string instead of rebuilding it per consumer.

Authored with Claude.
---
 autoparallel/approximate_sharding.py          | 127 +++++++++---------
 autoparallel/graph_passes/graph_clustering.py |  18 +--
 autoparallel/optimize_sharding.py             |   5 +
 3 files changed, 79 insertions(+), 71 deletions(-)

diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
index 27b5ca07..5361a8a0 100644
--- a/autoparallel/approximate_sharding.py
+++ b/autoparallel/approximate_sharding.py
@@ -121,8 +121,7 @@ def __init__(
         self,
         optimizer,
         candidate_limit: Optional[int] = 64,
-        bp_iters: int = 20,
-        bp_damping: float = 0.2,
+        bp_iters: int = 400,
         bp_tol: float = 1e-3,
         max_sweeps: int = 12,
         max_time_s: float = 60.0,
@@ -133,7 +132,6 @@ def __init__(
         self.opt = optimizer
         self.candidate_limit = candidate_limit
         self.bp_iters = bp_iters
-        self.bp_damping = bp_damping
         self.bp_tol = bp_tol
         self.max_sweeps = max_sweeps
         self.max_time_s = max_time_s
@@ -191,12 +189,15 @@ def _solve(self, verbose: bool = False):
             )
 
         deadline = t0 + self.max_time_s
-        # Candidate 1: belief propagation init.
+        # TRW-S init, then local-search polish. TRW-S reaches the exact MAP on the
+        # (integral) sharding problem, so the old greedy second candidate it used
+        # to be compared against is strictly dominated and has been dropped; the
+        # polish remains for the memory budget and as a local-search safety net.
         t_bp0 = time.perf_counter()
-        self._belief_propagation()
+        self._belief_propagation(deadline)
         if verbose:
-            logger.info("approx phase: bp converged iter=%s delta=%.4g in %.2fs; "
-                        "bp_decode energy=%.1f",
+            logger.info("approx phase: trws iter=%s delta=%.4g in %.2fs; "
+                        "decode energy=%.1f",
                         getattr(self, "_bp_last_iter", None),
                         getattr(self, "_bp_last_delta", float("nan")),
                         time.perf_counter() - t_bp0,
@@ -204,25 +205,11 @@ def _solve(self, verbose: bool = False):
         self._memory_repair()
         self._coordinate_descent(deadline)
         if verbose:
-            logger.info("approx phase: bp+cd energy=%.1f", self._fast_total_energy())
+            logger.info("approx phase: trws+cd energy=%.1f", self._fast_total_energy())
         self._star_block_search(deadline)
         bp_energy = self._fast_total_energy()
-        bp_snapshot = [g.current for g in self.groups]
         if verbose:
-            logger.info("approx phase: bp+cd+star energy=%.1f", bp_energy)
-
-        # Candidate 2: greedy init (cheap insurance against BP doing poorly).
-        self._greedy_init()
-        self._memory_repair()
-        self._coordinate_descent(deadline)
-        self._star_block_search(deadline)
-        greedy_energy = self._fast_total_energy()
-        if verbose:
-            logger.info("approx phase: greedy+cd+star energy=%.1f", greedy_energy)
-
-        if bp_energy <= greedy_energy:
-            for gid, ci in enumerate(bp_snapshot):
-                self._set_group(gid, ci)
+            logger.info("approx phase: trws+cd+star energy=%.1f", bp_energy)
         t_solve = time.perf_counter() - t0 - t_build
 
         objective = self._write_back()
@@ -240,12 +227,11 @@ def _solve(self, verbose: bool = False):
         )
         logger.info(
             "ApproximateShardingSolver: status=%s objective=%.4f "
-            "(bp=%.1f greedy=%.1f) groups=%d nodes=%d "
+            "(trws+polish=%.1f) groups=%d nodes=%d "
             "timings={build=%.3fs,solve=%.3fs,total=%.3fs}",
             status,
             objective,
             bp_energy,
-            greedy_energy,
             len(self.groups),
             len(self.cost_bearing),
             t_build,
@@ -260,7 +246,6 @@ def _solve(self, verbose: bool = False):
             "total_s": total_s,
             "groups": len(self.groups),
             "bp_energy": bp_energy,
-            "greedy_energy": greedy_energy,
         }
         if infeasible:
             raise RuntimeError(
@@ -1028,46 +1013,80 @@ def _fast_total_energy(self):
     # ------------------------------------------------------------------ #
     # Belief propagation (min-sum) + decode
     # ------------------------------------------------------------------ #
-    def _belief_propagation(self):
-        """Sequential (forward-backward, topological) min-sum message passing.
-        Exact MAP on trees in one sweep; near-optimal on the near-tree transformer
-        graph in a few sweeps, far better than synchronous flooding."""
+    def _belief_propagation(self, deadline=None):
+        """Sequential tree-reweighted message passing (TRW-S).
+
+        Plain loopy min-sum BP settles into globally-inconsistent fixed points on
+        this MRF (empirically 5-16% above the optimum). TRW-S optimizes a convex
+        upper bound over a tree decomposition (here: monotonic chains induced by a
+        node ordering), so on the integral sharding problem it converges to the
+        exact MAP. Node g is reweighted by 1/(chains through g) = 1/max(in,out)deg
+        under the ordering; forward and backward half-sweeps send only along edges
+        oriented with the pass. We decode each sweep and keep the best assignment."""
         G = len(self.groups)
+        if G == 0:
+            return
         unary = self.g_unary
         nbrs = self.nbrs
-        damp = self.bp_damping
 
         order = sorted(range(G), key=lambda g: min(self.groups[g].members))
+        pos = [0] * G
+        for i, g in enumerate(order):
+            pos[g] = i
+        gamma = np.ones(G)
+        for g in range(G):
+            indeg = sum(1 for h in nbrs[g] if pos[h] < pos[g])
+            outdeg = sum(1 for h in nbrs[g] if pos[h] > pos[g])
+            gamma[g] = 1.0 / max(indeg, outdeg, 1)
+
         msg: dict[tuple, np.ndarray] = {}
         for g in range(G):
             for h in nbrs[g]:
                 msg[(g, h)] = np.zeros(len(unary[h]))
 
+        # We decode every sweep and keep the best assignment. The decoded energy
+        # converges in long, irregular plateaus (it can sit at a high value for
+        # ~100 sweeps, drop, plateau again, then drop to the optimum), so neither
+        # an energy-plateau counter nor a message-delta threshold detects true
+        # convergence without stopping on a false plateau. We therefore run a
+        # fixed sweep budget (bounded by the time deadline), which is enough for
+        # the slowest converger observed, and an exact fixed point ends early.
+        best_e = INF
+        best_snap = None
         for sweep in range(self.bp_iters):
             max_delta = 0.0
-            for direction in (order, order[::-1]):
-                for g in direction:
+            for forward in (True, False):
+                for g in order if forward else order[::-1]:
                     if not nbrs[g]:
                         continue
-                    in_sum = unary[g].copy()
-                    for k in nbrs[g]:
-                        in_sum += msg[(k, g)]
+                    wp = unary[g].copy()
+                    for r in nbrs[g]:
+                        wp += msg[(r, g)]
+                    wp *= gamma[g]
                     for h in nbrs[g]:
-                        excl = in_sum - msg[(h, g)]
+                        if (pos[h] > pos[g]) != forward:
+                            continue
                         P = self._pair_matrix(g, h)  # (D_g, D_h)
-                        m = (excl[:, None] + P).min(axis=0)
+                        m = ((wp - msg[(h, g)])[:, None] + P).min(axis=0)
                         m -= m.min()
-                        md = (1 - damp) * m + damp * msg[(g, h)]
-                        delta = np.abs(md - msg[(g, h)]).max()
-                        if delta > max_delta:
-                            max_delta = delta
-                        msg[(g, h)] = md
+                        d = np.abs(m - msg[(g, h)]).max()
+                        if d > max_delta:
+                            max_delta = d
+                        msg[(g, h)] = m
+            self._decode(msg)
+            e = self._fast_total_energy()
+            if e < best_e:
+                best_e, best_snap = e, [grp.current for grp in self.groups]
             self._bp_last_iter = sweep + 1
             self._bp_last_delta = max_delta
-            if max_delta < self.bp_tol:
+            if max_delta == 0.0 or (
+                deadline is not None and time.perf_counter() > deadline
+            ):
                 break
 
-        self._decode(msg)
+        if best_snap is not None:
+            for gid, ci in enumerate(best_snap):
+                self._set_group(gid, ci)
 
     def _decode(self, msg):
         """Sequential topological decode: fix each group to the argmin of its
@@ -1097,24 +1116,6 @@ def _set_group(self, gid, ci):
         for m, o in group.choices[ci].items():
             self.cur_out[m] = o
 
-    def _greedy_init(self):
-        order = sorted(range(len(self.groups)),
-                       key=lambda g: min(self.groups[g].members))
-        for gid in order:
-            self._set_group(gid, 0)
-        for gid in order:
-            best_i, best_e = 0, INF
-            for ci in range(self.groups[gid].domain):
-                e = self.g_unary[gid][ci]
-                for h in self.nbrs[gid]:
-                    if min(self.groups[h].members) < min(self.groups[gid].members):
-                        ch = self.groups[h].current
-                        e += (self.C[(gid, h)][ci, ch] if gid < h
-                              else self.C[(h, gid)][ch, ci])
-                if e < best_e:
-                    best_i, best_e = ci, e
-            self._set_group(gid, best_i)
-
     def _coordinate_descent(self, deadline):
         for _ in range(self.max_sweeps):
             if time.perf_counter() > deadline:
diff --git a/autoparallel/graph_passes/graph_clustering.py b/autoparallel/graph_passes/graph_clustering.py
index c01a09a3..a8efafea 100644
--- a/autoparallel/graph_passes/graph_clustering.py
+++ b/autoparallel/graph_passes/graph_clustering.py
@@ -65,18 +65,17 @@ def _prepare_op_strategy(op_strategy):
     return str(op_strategy)
 
 
-def _hash_node(node, strategies, input_pickler):
+def _hash_node(node, strategies, input_pickler, op_str):
+    # op_str caches _prepare_op_strategy(strategies[n]) per node: each node's
+    # (large, 3D-mesh) strategy string is otherwise rebuilt once as self plus
+    # once per consumer, dominating clustering time on deep models.
     key = (
         str(node.target),
         node.meta.get("partitioner_tag"),
         node.meta.get("stack_trace"),
         _normalize_args(node),
-        _prepare_op_strategy(strategies[node]),
-        tuple(
-            _prepare_op_strategy(strategies[s])
-            for s in node.all_input_nodes
-            if s in strategies
-        ),
+        op_str[node],
+        tuple(op_str[s] for s in node.all_input_nodes if s in strategies),
     )
     return sha256_hash(input_pickler.dumps(key))
 
@@ -107,6 +106,7 @@ def get_identical_regions(
     hash_to_duplicates: dict[str, IdenticalNodes] = defaultdict(list)
     node_to_duplicates: dict[Node, IdenticalNodes] = {}
     t = time.time()
+    op_str = {n: _prepare_op_strategy(s) for n, s in strategies.items()}
     for node in graph.nodes:
         if node.op == "placeholder":
             continue
@@ -115,7 +115,9 @@ def get_identical_regions(
             # HOP submodule get_attr nodes are not in strategies.
             continue
 
-        duplicates = hash_to_duplicates[_hash_node(node, strategies, input_pickler)]
+        duplicates = hash_to_duplicates[
+            _hash_node(node, strategies, input_pickler, op_str)
+        ]
         duplicates.append(node)
         node_to_duplicates[node] = duplicates
     logger.debug(f"Hashed nodes in {time.time() - t} s")
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 1620be87..9e73889e 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -1209,6 +1209,11 @@ def validate(self):
                 continue
             if node not in self.strats:
                 continue
+            # Cluster copies are structurally identical to their root (same
+            # strategies and input structure, asserted in create_cluster_links),
+            # so validating the root covers them.
+            if self.node_map[node] in self.cluster_links:
+                continue
             strat = self.strats[node]
             strat0 = strat.strategies[0]
             all_input_nodes = self._all_input_nodes(node)

From 2ce86ea934cfbfd30ac0f51d28f89c91b6da6c09 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 21:18:43 -0700
Subject: [PATCH 20/27] Add approx-solver diagnostic + accuracy benchmarks

Helper scripts used to diagnose and validate the TRW-S fix: factor-graph
faithfulness/representability check, LP integrality check, hyperparameter and
iterated-local-search sweeps, a standalone TRW-S prototype, an annotation
ablation, and a per-phase build profiler.

Authored with Claude.
---
 examples/_bench_anno.py           | 116 ++++++++++++++++++++
 examples/_bench_approx_diag.py    | 173 ++++++++++++++++++++++++++++++
 examples/_bench_approx_ils.py     | 136 +++++++++++++++++++++++
 examples/_bench_approx_sweep.py   | 106 ++++++++++++++++++
 examples/_bench_build_profile.py  |  19 +++-
 examples/_bench_lp_integrality.py | 118 ++++++++++++++++++++
 examples/_bench_trws.py           | 173 ++++++++++++++++++++++++++++++
 7 files changed, 838 insertions(+), 3 deletions(-)
 create mode 100644 examples/_bench_anno.py
 create mode 100644 examples/_bench_approx_diag.py
 create mode 100644 examples/_bench_approx_ils.py
 create mode 100644 examples/_bench_approx_sweep.py
 create mode 100644 examples/_bench_lp_integrality.py
 create mode 100644 examples/_bench_trws.py

diff --git a/examples/_bench_anno.py b/examples/_bench_anno.py
new file mode 100644
index 00000000..45e546ff
--- /dev/null
+++ b/examples/_bench_anno.py
@@ -0,0 +1,116 @@
+"""prune+dp+annotation (the full joint config) vs prune+dp alone, compared to a
+known optimum/LP lower bound. Lite build + optional TP-plan annotation + approx.
+Env: MODEL, MESH, SEQLEN, LP_LB."""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "70b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+LP_LB = float(os.environ.get("LP_LB", "0"))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
+                                max_seq_len=SEQLEN, **_CFG[MODEL])
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),)
+ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),)
+
+
+def annotate_tp_plan(autop):
+    for proj in ["wq", "wk", "wv"]:
+        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL)
+    autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL)
+    for proj in ["w1", "w3"]:
+        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL)
+    autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL)
+
+
+def constrain(autop):
+    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+    out = (Shard(0), Shard(2)) if ndim == 2 else x
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([x])
+    autop.add_output_constraints([out])
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### anno MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True)
+
+
+def gap(o):
+    return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan")
+
+
+# prune+dp (no annotation)
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
+autop.__enter__()
+constrain(autop)
+build_s = time.perf_counter() - t
+opt = autop.sharding_optimizer
+t = time.perf_counter()
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+dp_s = time.perf_counter() - t
+obj_dp = opt.profile["approximate"]["objective"]
+print(f"[dp]     build={build_s:.1f}s approx={dp_s:.1f}s obj={obj_dp:.1f} gap={gap(obj_dp):+.2f}%", flush=True)
+
+# + annotation
+t = time.perf_counter()
+annotate_tp_plan(autop)
+prop = autop.propagate_annotations(verbose=False, method="fix")
+prop_s = time.perf_counter() - t
+t = time.perf_counter()
+ApproximateShardingSolver(opt).get_solution(verbose=False)
+ann_s = time.perf_counter() - t
+obj_ann = opt.profile["approximate"]["objective"]
+print(f"[dp+anno] build={build_s:.1f}s propagate={prop_s:.1f}s approx={ann_s:.1f}s "
+      f"total={build_s+prop_s+ann_s:.1f}s obj={obj_ann:.1f} gap={gap(obj_ann):+.2f}% "
+      f"(pinned {prop.nodes_determined} nodes)", flush=True)
+print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} dp_gap={gap(obj_dp):+.2f}% "
+      f"dp+anno_gap={gap(obj_ann):+.2f}% dp+anno_total={build_s+prop_s+ann_s:.1f}s", flush=True)
diff --git a/examples/_bench_approx_diag.py b/examples/_bench_approx_diag.py
new file mode 100644
index 00000000..25de4d85
--- /dev/null
+++ b/examples/_bench_approx_diag.py
@@ -0,0 +1,173 @@
+"""Diagnose the bare approx gap: is the factor graph FAITHFUL (scores the true
+optimum correctly -> solver is at fault) or UNFAITHFUL (drops cost -> model is at
+fault), and is the optimum REPRESENTABLE in the group choices (pruning)?
+
+Builds the ILP, solves it exactly with CBC, then checks whether the approx's own
+machinery (total_objective + factor graph) reproduces the CBC optimum, and where
+the approx's own solution differs. Env: MODEL, MESH, SEQLEN."""
+import logging
+import os
+import time
+from collections import defaultdict
+from unittest.mock import patch
+
+import pulp
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "1b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
+                                max_seq_len=SEQLEN, **_CFG[MODEL])
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+def constrain(autop):
+    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+    out = (Shard(0), Shard(2)) if ndim == 2 else x
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([x])
+    autop.add_output_constraints([out])
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### diag MODEL={MODEL} mesh={MESH_SHAPE}{names} ###", flush=True)
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
+autop.__enter__()
+constrain(autop)
+opt = autop.sharding_optimizer
+print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
+
+opt._set_objective()
+opt._apply_memory_constraint()
+t = time.perf_counter()
+opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"]))
+obj_cbc = pulp.value(opt.prob.objective)
+print(f"[cbc] solve={time.perf_counter()-t:.1f}s obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]}", flush=True)
+
+# CBC per-(root)node chosen out_idx
+cbc_out = {}
+for key, var in opt.pulp_variables.items():
+    v = var.varValue
+    if v is not None and v > 0.5:
+        cbc_out[key[0]] = key[2]
+
+approx = ApproximateShardingSolver(opt)
+approx._build_problem()
+approx._build_factors()
+
+# (A) FAITHFULNESS: exact objective of the CBC solution via the approx machinery.
+approx.cur_out = dict(cbc_out)
+e_cbc_total = approx.total_objective()
+print(f"[faithful] approx.total_objective(CBC soln) = {e_cbc_total:.1f}  "
+      f"(CBC obj {obj_cbc:.1f}; match={abs(e_cbc_total-obj_cbc)<1.0})", flush=True)
+
+# (B) REPRESENTABILITY: can the group choices express the CBC solution?
+cbc_full = dict(cbc_out)
+for copy_idx, root_idx in opt.cluster_links.items():
+    if root_idx in cbc_out:
+        cbc_full[copy_idx] = cbc_out[root_idx]
+unrep = []
+cbc_group_choice = {}
+for gid, g in enumerate(approx.groups):
+    found = None
+    for ci, choice in enumerate(g.choices):
+        if all(cbc_full.get(m) == o for m, o in choice.items()):
+            found = ci
+            break
+    if found is None:
+        unrep.append(gid)
+    else:
+        cbc_group_choice[gid] = found
+print(f"[representable] groups={len(approx.groups)} "
+      f"with_no_matching_choice={len(unrep)}", flush=True)
+
+# (C) factor-graph energy of the CBC solution (if representable)
+if not unrep:
+    for gid, ci in cbc_group_choice.items():
+        approx._set_group(gid, ci)
+    fge = approx._fast_total_energy()
+    print(f"[fg-energy] _fast_total_energy(CBC soln) = {fge:.1f} "
+          f"(match CBC {abs(fge-obj_cbc)<1.0})", flush=True)
+
+# (D) run the normal approx, localize where it differs from CBC
+approx2 = ApproximateShardingSolver(opt)
+approx2.get_solution(verbose=False)
+obj_approx = opt.profile["approximate"]["objective"]
+ax_out = dict(approx2.cur_out)
+print(f"[approx] obj={obj_approx:.1f} gap={100*(obj_approx-obj_cbc)/obj_cbc:+.2f}%", flush=True)
+
+# per-node exact cost under each assignment (cost_bearing nodes), to localize gap
+def node_cost(solver, out_map, v):
+    o = out_map[v]
+    node = opt.nodes[v]
+    strat = opt.strats[node].strategies[o]
+    prod = solver._arg_prod.get(v, {})
+    c = 0.0
+    for argi in range(len(strat.redistribute_cost)):
+        p = prod.get(argi)
+        inp = out_map[p] if (p is not None and p in out_map) else 0
+        key = (v, argi, o, inp)
+        dv = opt.decision_vars.get(key)
+        if dv is None:
+            return None
+        c += dv.cost
+    return solver.node_mult[v] * c
+
+diffs = []
+for v in approx2.cost_bearing:
+    if cbc_out.get(v) != ax_out.get(v):
+        c_cbc = node_cost(approx2, cbc_out, v)
+        c_ax = node_cost(approx2, ax_out, v)
+        if c_cbc is not None and c_ax is not None:
+            diffs.append((c_ax - c_cbc, v, opt.nodes[v].name, cbc_out.get(v), ax_out.get(v)))
+diffs.sort(reverse=True)
+print(f"[localize] {len(diffs)} cost-bearing nodes differ; top contributors (approx-cbc):", flush=True)
+for d, v, name, oc, oa in diffs[:15]:
+    print(f"    +{d:10.1f}  node={name[:40]:40s} cbc_out={oc} approx_out={oa}", flush=True)
+tot = sum(d for d, *_ in diffs)
+print(f"[localize] total node-cost diff over differing nodes = {tot:.1f} "
+      f"(gap = {obj_approx-obj_cbc:.1f})", flush=True)
diff --git a/examples/_bench_approx_ils.py b/examples/_bench_approx_ils.py
new file mode 100644
index 00000000..d6e1b437
--- /dev/null
+++ b/examples/_bench_approx_ils.py
@@ -0,0 +1,136 @@
+"""Diagnose whether the approx solver's objective is stuck in a local-optimum
+basin that a stronger search escapes. Build once, run the stock BP+localsearch,
+then run iterated local search (perturb a random set of groups, re-optimize,
+keep best) for a time budget. If ILS beats the stock objective meaningfully, the
+gap is a move-set/init weakness (and the LP bound is ~reachable); if not, 607260
+is robust. Env: MODEL, MESH, SEQLEN, LP_LB, ILS_S."""
+import logging
+import os
+import random
+import time
+from unittest.mock import patch
+
+import numpy as np
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "70b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+LP_LB = float(os.environ.get("LP_LB", "0"))
+ILS_S = float(os.environ.get("ILS_S", "180"))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
+                                max_seq_len=SEQLEN, **_CFG[MODEL])
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### ILS MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ils_s={ILS_S} ###", flush=True)
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
+autop.__enter__()
+x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out = (Shard(0), Shard(2)) if ndim == 2 else x
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x])
+autop.add_output_constraints([out])
+opt = autop.sharding_optimizer
+print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
+
+
+def gap(o):
+    return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan")
+
+
+s = ApproximateShardingSolver(opt)
+s._build_problem()
+s._build_factors()
+G = len(s.groups)
+domains = [g.domain for g in s.groups]
+multi = [d for d in domains if d > 1]
+edges = len(s.C)
+print(f"[graph] groups={G} multi_choice_groups={len(multi)} "
+      f"max_domain={max(domains)} sum_domain={sum(domains)} pair_edges={edges}", flush=True)
+
+# Stock solve (BP + local search), mirrors _solve's BP candidate.
+deadline = time.perf_counter() + 1e9
+s._belief_propagation()
+s._memory_repair()
+s._coordinate_descent(deadline)
+s._star_block_search(deadline)
+stock = s._fast_total_energy()
+best = stock
+best_snap = [g.current for g in s.groups]
+print(f"[stock] bp+cd+star energy={stock:.1f} gap={gap(stock):+.2f}%", flush=True)
+
+# Iterated local search: perturb k random multi-choice groups, re-optimize, keep best.
+rng = random.Random(0)
+multi_gids = [g for g in range(G) if s.groups[g].domain > 1]
+t0 = time.perf_counter()
+iters = 0
+accepts = 0
+while time.perf_counter() - t0 < ILS_S:
+    iters += 1
+    # restore best, then kick
+    for gid, ci in enumerate(best_snap):
+        s._set_group(gid, ci)
+    k = rng.randint(1, max(2, len(multi_gids) // 10))
+    for gid in rng.sample(multi_gids, min(k, len(multi_gids))):
+        s._set_group(gid, rng.randrange(s.groups[gid].domain))
+    s._memory_repair()
+    s._coordinate_descent(deadline)
+    s._star_block_search(deadline)
+    e = s._fast_total_energy()
+    if e < best - 1e-6:
+        best = e
+        best_snap = [g.current for g in s.groups]
+        accepts += 1
+        print(f"[ils] iter={iters} NEW BEST energy={best:.1f} gap={gap(best):+.2f}% "
+              f"(k={k})", flush=True)
+
+for gid, ci in enumerate(best_snap):
+    s._set_group(gid, ci)
+exact = s._write_back()
+print(f"[ILS done] iters={iters} accepts={accepts} stock={stock:.1f} "
+      f"best={best:.1f} exact_obj={exact:.1f} gap={gap(exact):+.2f}% "
+      f"(improvement vs stock = {100*(stock-best)/stock:.2f}%)", flush=True)
diff --git a/examples/_bench_approx_sweep.py b/examples/_bench_approx_sweep.py
new file mode 100644
index 00000000..3d73a070
--- /dev/null
+++ b/examples/_bench_approx_sweep.py
@@ -0,0 +1,106 @@
+"""Build one model (lite) once, then run ApproximateShardingSolver under several
+hyperparameter configs to see whether the objective gap (vs a known LP lower
+bound) is closable by tuning (candidate pruning / BP iters / time / local search)
+or is structural. Env: MODEL, MESH, SEQLEN, LP_LB (reference lower bound)."""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "70b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+LP_LB = float(os.environ.get("LP_LB", "0"))
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL])
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### approx sweep MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True)
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
+autop.__enter__()
+x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out = (Shard(0), Shard(2)) if ndim == 2 else x
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x])
+autop.add_output_constraints([out])
+opt = autop.sharding_optimizer
+print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
+
+CONFIGS = [
+    ("default", dict()),
+    ("cand=256", dict(candidate_limit=256)),
+    ("cand=None", dict(candidate_limit=None)),
+    ("bp=100", dict(bp_iters=100)),
+    ("sweeps=200,star=20,t=600", dict(max_sweeps=200, star_passes=20, max_time_s=600)),
+    ("star_children=64,domain=4096", dict(max_star_children=64, group_domain_limit=4096)),
+    ("ALL generous", dict(candidate_limit=None, bp_iters=100, max_sweeps=200,
+                          star_passes=20, max_time_s=900, max_star_children=64,
+                          group_domain_limit=4096)),
+]
+
+best = None
+for name, cfg in CONFIGS:
+    t = time.perf_counter()
+    solver = ApproximateShardingSolver(opt, **cfg)
+    solver.get_solution(verbose=False)
+    dt = time.perf_counter() - t
+    ap = opt.profile["approximate"]
+    obj = ap["objective"]
+    gap = 100 * (obj - LP_LB) / LP_LB if LP_LB else float("nan")
+    winner = "bp" if ap["bp_energy"] <= ap["greedy_energy"] else "greedy"
+    print(f"[cfg] {name:30s} obj={obj:.1f} gap={gap:+.2f}% "
+          f"bp={ap['bp_energy']:.1f} greedy={ap['greedy_energy']:.1f} win={winner} "
+          f"t={dt:.1f}s", flush=True)
+    if best is None or obj < best[1]:
+        best = (name, obj)
+
+print(f"[BEST] {best[0]} obj={best[1]:.1f} "
+      f"gap={100*(best[1]-LP_LB)/LP_LB:+.2f}%" if LP_LB else f"[BEST] {best[0]} obj={best[1]:.1f}",
+      flush=True)
diff --git a/examples/_bench_build_profile.py b/examples/_bench_build_profile.py
index 82b31bd6..03b6a2c9 100644
--- a/examples/_bench_build_profile.py
+++ b/examples/_bench_build_profile.py
@@ -18,6 +18,13 @@
 from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
 
 logging.basicConfig(level=logging.ERROR)
+if os.environ.get("DEBUG_CLUSTER") == "1":
+    h = logging.StreamHandler()
+    h.setLevel(logging.DEBUG)
+    for nm in ("autoparallel.graph_passes.graph_clustering", "autoparallel.optimize_sharding"):
+        lg = logging.getLogger(nm)
+        lg.setLevel(logging.DEBUG)
+        lg.addHandler(h)
 for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
                 ("get_device_capability", lambda *a, **k: (9, 0))]:
     patch(f"torch.cuda.{fn}", val).start()
@@ -25,8 +32,15 @@
     "P", (), {"major": 9, "minor": 0, "name": "H100",
               "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
 
+MODEL = os.environ.get("MODEL", "1b")
 SEQLEN = int(os.environ.get("SEQLEN", "2048"))
 MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
 ws = 1
 for d in MESH_SHAPE:
     ws *= d
@@ -40,8 +54,7 @@
 
 def model_fn():
     args = TransformerModelArgs(
-        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5,
-        multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN)
+        rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL])
     with torch.device("meta"):
         return Transformer(args)
 
@@ -52,7 +65,7 @@ def input_fn():
 
 set_nccl_topo_config(detect_nccl_topo_config(mesh))
 mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"=== build profile: mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True)
+print(f"=== build profile: MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True)
 
 t = time.perf_counter()
 autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
diff --git a/examples/_bench_lp_integrality.py b/examples/_bench_lp_integrality.py
new file mode 100644
index 00000000..1c95b7e1
--- /dev/null
+++ b/examples/_bench_lp_integrality.py
@@ -0,0 +1,118 @@
+"""Re-solve the 70B LP relaxation and report how integral the optimum is: count
+fractional variables in the HiGHS solution. If ~all variables are 0/1, the LP
+optimum is reachable by integers (so an approx gap is a real solver failure); if
+many are fractional, the LP bound is loose (and the approx may be near-optimal).
+Also reports the objective with the memory constraint dropped, to test whether
+the memory budget is the fractionality source. Env: MODEL, MESH, SEQLEN."""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import numpy as np
+import pulp
+import scipy.sparse as sp
+import torch
+from scipy.optimize import linprog
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "70b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
+DROP_MEM = os.environ.get("DROP_MEM", "0") == "1"
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
+                                max_seq_len=SEQLEN, **_CFG[MODEL])
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### LP integrality MODEL={MODEL} mesh={MESH_SHAPE}{names} drop_mem={DROP_MEM} ###", flush=True)
+
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
+autop.__enter__()
+x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out = (Shard(0), Shard(2)) if ndim == 2 else x
+autop.add_parameter_memory_constraint(low=None, high=None)
+autop.add_input_constraints([x])
+autop.add_output_constraints([out])
+opt = autop.sharding_optimizer
+print(f"[build] full_build={time.perf_counter()-t:.1f}s", flush=True)
+
+opt._set_objective()
+if not DROP_MEM:
+    opt._apply_memory_constraint()
+variables = opt.prob.variables()
+vidx = {id(v): i for i, v in enumerate(variables)}
+n = len(variables)
+c = np.zeros(n)
+for key, dv in opt.decision_vars.items():
+    mult = 1 + len(opt._root_to_copies.get(key[0], ()))
+    c[vidx[id(dv.var)]] += dv.cost * mult
+re = ru = 0
+reqr, reqc, reqd, beq = [], [], [], []
+rubr, rubc, rubd, bub = [], [], [], []
+for con in opt.prob.constraints.values():
+    rhs = -con.constant
+    if con.sense == pulp.LpConstraintEQ:
+        for v, co in con.items():
+            reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co)
+        beq.append(rhs); re += 1
+    else:
+        sgn = 1.0 if con.sense == pulp.LpConstraintLE else -1.0
+        for v, co in con.items():
+            rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(sgn * co)
+        bub.append(sgn * rhs); ru += 1
+A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None
+A_ub = sp.csr_matrix((rubd, (rubr, rubc)), shape=(ru, n)) if ru else None
+t = time.perf_counter()
+res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None),
+              bounds=(0, 1), method="highs-ds", options={"disp": True})
+print(f"[lp] solve={time.perf_counter()-t:.1f}s status={res.message}", flush=True)
+xv = res.x
+freq = np.abs(xv - np.round(xv))
+nfrac = int((freq > 1e-6).sum())
+nfrac4 = int((freq > 1e-4).sum())
+# weight fractionality by objective contribution to see if it matters
+frac_obj = float(np.abs(c * freq).sum())
+print(f"[RESULT] MODEL={MODEL} drop_mem={DROP_MEM} obj={res.fun:.1f} "
+      f"vars={n} fractional(>1e-6)={nfrac} ({100*nfrac/n:.4f}%) "
+      f"fractional(>1e-4)={nfrac4} frac_obj_weight={frac_obj:.1f}", flush=True)
diff --git a/examples/_bench_trws.py b/examples/_bench_trws.py
new file mode 100644
index 00000000..4e4fbc2d
--- /dev/null
+++ b/examples/_bench_trws.py
@@ -0,0 +1,173 @@
+"""Prototype TRW-S (tree-reweighted sequential message passing) on the approx
+solver's faithful factor graph, validated against the CBC-exact optimum. If TRW-S
+(optionally + the existing local search) reaches the optimum where plain min-sum
+BP does not, it is the fix. Env: MODEL, MESH, SEQLEN, ITERS."""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import numpy as np
+import pulp
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
+                ("get_device_capability", lambda *a, **k: (9, 0))]:
+    patch(f"torch.cuda.{fn}", val).start()
+patch("torch.cuda.get_device_properties", lambda *a, **k: type(
+    "P", (), {"major": 9, "minor": 0, "name": "H100",
+              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
+
+MODEL = os.environ.get("MODEL", "1b")
+SEQLEN = int(os.environ.get("SEQLEN", "2048"))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
+ITERS = int(os.environ.get("ITERS", "1000"))
+USE_CBC = os.environ.get("CBC", "1") == "1"
+ws = 1
+for d in MESH_SHAPE:
+    ws *= d
+names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
+torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
+mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
+ndim = mesh.ndim
+vocab_size = 128256
+batch_size = 2 * mesh.shape[0]
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
+}
+
+
+def model_fn():
+    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
+                                max_seq_len=SEQLEN, **_CFG[MODEL])
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
+
+
+def constrain(autop):
+    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
+    out = (Shard(0), Shard(2)) if ndim == 2 else x
+    autop.add_parameter_memory_constraint(low=None, high=None)
+    autop.add_input_constraints([x])
+    autop.add_output_constraints([out])
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+print(f"### TRW-S MODEL={MODEL} mesh={MESH_SHAPE}{names} iters={ITERS} ###", flush=True)
+
+backend = "ilp" if USE_CBC else "approx"
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver=backend)
+autop.__enter__()
+constrain(autop)
+opt = autop.sharding_optimizer
+print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
+
+obj_cbc = None
+if USE_CBC:
+    opt._set_objective()
+    opt._apply_memory_constraint()
+    t = time.perf_counter()
+    opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"]))
+    obj_cbc = pulp.value(opt.prob.objective)
+    print(f"[cbc] obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]} "
+          f"({time.perf_counter()-t:.1f}s)", flush=True)
+
+
+_REF = obj_cbc if obj_cbc else float(os.environ.get("LP_LB", "0")) or None
+
+
+def gap(o):
+    return 100 * (o - _REF) / _REF if _REF else float("nan")
+
+
+# Stock approx (BP + local search) for comparison.
+a0 = ApproximateShardingSolver(opt)
+t = time.perf_counter()
+a0.get_solution(verbose=False)
+print(f"[stock approx] obj={opt.profile['approximate']['objective']:.1f} "
+      f"gap={gap(opt.profile['approximate']['objective']):+.2f}% ({time.perf_counter()-t:.1f}s)", flush=True)
+
+# Build a fresh factor graph for TRW-S.
+A = ApproximateShardingSolver(opt)
+A._build_problem()
+A._build_factors()
+G = len(A.groups)
+nbrs = A.nbrs
+unary = A.g_unary
+order = sorted(range(G), key=lambda g: min(A.groups[g].members))
+pos = [0] * G
+for i, g in enumerate(order):
+    pos[g] = i
+gamma = []
+for g in range(G):
+    indeg = sum(1 for h in nbrs[g] if pos[h] < pos[g])
+    outdeg = sum(1 for h in nbrs[g] if pos[h] > pos[g])
+    gamma.append(1.0 / max(1, max(indeg, outdeg)))
+
+msg = {}
+for g in range(G):
+    for h in nbrs[g]:
+        msg[(g, h)] = np.zeros(len(unary[h]))
+
+t = time.perf_counter()
+best = float("inf")
+best_snap = None
+for it in range(ITERS):
+    for forward in (True, False):
+        seq = order if forward else order[::-1]
+        for p in seq:
+            if not nbrs[p]:
+                continue
+            agg = unary[p].copy()
+            for r in nbrs[p]:
+                agg += msg[(r, p)]
+            wp = gamma[p] * agg
+            for q in nbrs[p]:
+                if (pos[q] > pos[p]) != forward:
+                    continue
+                P = A._pair_matrix(p, q)  # (D_p, D_q)
+                mm = (wp - msg[(q, p)])[:, None] + P
+                mq = mm.min(axis=0)
+                mq -= mq.min()
+                msg[(p, q)] = mq
+    A._decode(msg)
+    e = A._fast_total_energy()
+    if e < best - 1e-6:
+        best = e
+        best_snap = [g.current for g in A.groups]
+    if it < 5 or it % 50 == 0:
+        print(f"  [trws it={it}] decode_energy={e:.1f} best={best:.1f} gap={gap(best):+.2f}%", flush=True)
+trws_s = time.perf_counter() - t
+for gid, ci in enumerate(best_snap):
+    A._set_group(gid, ci)
+print(f"[TRW-S] best={best:.1f} gap={gap(best):+.2f}% ({trws_s:.1f}s, {ITERS} iters)", flush=True)
+
+# Polish TRW-S result with the existing local search.
+deadline = time.perf_counter() + 60
+A._memory_repair()
+A._coordinate_descent(deadline)
+A._star_block_search(deadline)
+polished = A._fast_total_energy()
+print(f"[TRW-S + local search] obj={polished:.1f} gap={gap(polished):+.2f}%", flush=True)
+print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} cbc={obj_cbc} "
+      f"stock_gap={gap(opt.profile['approximate']['objective']):+.2f}% "
+      f"trws_gap={gap(best):+.2f}% trws_ls_gap={gap(polished):+.2f}%", flush=True)

From ba02ea52b73688f997b88267b76dd807947bfa1e Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 31 May 2026 23:17:02 -0700
Subject: [PATCH 21/27] Add real-GPU LLaMA3 training sanity check; drop stale
 loss-curve symlinks

examples/_sanity_llama3.py traces LLaMA3, selects a strategy with the approximate
(TRW-S) solver, applies it as DTensor, and trains a fixed random batch for a few
steps on real GPUs over a 2D or 3D mesh, verifying the loss curve descends. Also
removes three dangling loss-curve symlinks left over from an earlier run.

Authored with Claude.
---
 examples/_sanity_llama3.py                   | 215 +++++++++++++++++++
 qwen3_8b_autoparallel_30steps_loss_curve.png |   1 -
 qwen3_8b_autoparallel_30steps_loss_curve.svg |   1 -
 qwen3_8b_autoparallel_30steps_losses.csv     |   1 -
 4 files changed, 215 insertions(+), 3 deletions(-)
 create mode 100644 examples/_sanity_llama3.py
 delete mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.png
 delete mode 120000 qwen3_8b_autoparallel_30steps_loss_curve.svg
 delete mode 120000 qwen3_8b_autoparallel_30steps_losses.csv

diff --git a/examples/_sanity_llama3.py b/examples/_sanity_llama3.py
new file mode 100644
index 00000000..71fc1122
--- /dev/null
+++ b/examples/_sanity_llama3.py
@@ -0,0 +1,215 @@
+"""Real LLaMA3 AutoParallel training sanity check on a 2D or 3D mesh.
+
+Traces the model, picks a sharding strategy with the approximate (TRW-S) solver,
+applies it as DTensor, and trains a fixed random batch for a few steps on real
+GPUs. Pass: the loss curve goes down. Adapted from example_sanity_check_qwen3.py.
+
+The batch is data-parallel on the `dp` axis only; any other axes (`cp`, `tp`)
+are model-sharding axes (the solver shards params/activations over them). Logits
+are vocab-parallel on `tp` and replicated on `cp`, so the loss is reduced over
+the world and normalized by global_token_count * (world_size // dp_degree).
+
+Run: torchrun --standalone --nproc-per-node N examples/_sanity_llama3.py --mesh 2,2,8 --model 8b
+"""
+import argparse
+import logging
+import os
+import time
+
+import torch
+import torch.distributed as dist
+import torch.distributed.nn.functional as dist_nn_func
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+
+_CFG = {
+    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
+    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
+}
+_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description="LLaMA3 AutoParallel training sanity check.")
+    p.add_argument("--model", type=str, default="1b", choices=list(_CFG))
+    p.add_argument("--mesh", type=str, default="2,2", help="comma-separated mesh dims")
+    p.add_argument("--global-batch-size", type=int, default=8)
+    p.add_argument("--microbatch-size", type=int, default=2)
+    p.add_argument("--seq-len", type=int, default=512)
+    p.add_argument("--train-steps", type=int, default=10)
+    p.add_argument("--lr", type=float, default=1e-3)
+    p.add_argument("--max-grad-norm", type=float, default=1.0)
+    p.add_argument("--seed", type=int, default=0)
+    p.add_argument("--solver", type=str, default="approx")
+    p.add_argument("--verbose", action="store_true")
+    return p.parse_args()
+
+
+def init_distributed(args):
+    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
+        raise RuntimeError("Run with torchrun --standalone --nproc-per-node N ...")
+    world_size = int(os.environ["WORLD_SIZE"])
+    local_rank = int(os.environ["LOCAL_RANK"])
+    dims = tuple(int(x) for x in args.mesh.split(","))
+    prod = 1
+    for d in dims:
+        prod *= d
+    if prod != world_size:
+        raise ValueError(f"WORLD_SIZE {world_size} != prod(mesh) {prod}")
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    dist.init_process_group("nccl", device_id=device)
+    mesh = torch.distributed.device_mesh.init_device_mesh(
+        "cuda", dims, mesh_dim_names=_NAMES[len(dims)]
+    )
+    return device, mesh
+
+
+def placement_for(name, *, is_output):
+    if name == "dp":
+        return Shard(0)
+    if name == "tp" and is_output:
+        return Shard(2)
+    return Replicate()
+
+
+def make_local_tokens(args, mesh, device, vocab_size):
+    names = mesh.mesh_dim_names
+    dp_rank = mesh.get_coordinate()[names.index("dp")]
+    dp_degree = mesh["dp"].size()
+    local_batch_size = args.global_batch_size // dp_degree
+    gen = torch.Generator(device="cpu")
+    gen.manual_seed(args.seed)
+    tokens = torch.randint(
+        0, vocab_size, (args.global_batch_size, args.seq_len + 1),
+        generator=gen, dtype=torch.long,
+    )
+    start = dp_rank * local_batch_size
+    return tokens[start:start + local_batch_size].to(device, non_blocking=True)
+
+
+def vocab_parallel_cross_entropy(logits, labels, *, vocab_size, tp_group, tp_rank,
+                                 tp_degree, normalizer):
+    local_vocab_size = logits.shape[-1]
+    vocab_start = tp_rank * local_vocab_size
+    vocab_stop = vocab_size if tp_rank == tp_degree - 1 else vocab_start + local_vocab_size
+    logits = logits.float()
+    local_max = logits.amax(dim=-1)
+    with torch.no_grad():
+        global_max = local_max.detach().clone()
+        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
+    shifted = logits - global_max.unsqueeze(-1)
+    global_exp_sum = dist_nn_func.all_reduce(
+        shifted.exp().sum(dim=-1), op=dist.ReduceOp.SUM, group=tp_group)
+    mask = (labels >= vocab_start) & (labels < vocab_stop)
+    local_target = torch.zeros_like(labels, dtype=torch.long)
+    local_target[mask] = labels[mask] - vocab_start
+    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
+    local_target_logits = local_target_logits * mask.to(logits.dtype)
+    target_logits = dist_nn_func.all_reduce(
+        local_target_logits, op=dist.ReduceOp.SUM, group=tp_group)
+    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
+    return loss_sum / normalizer
+
+
+def print_rank0(msg):
+    if dist.get_rank() == 0:
+        print(msg, flush=True)
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING)
+    device, mesh = init_distributed(args)
+    names = mesh.mesh_dim_names
+    world_size = dist.get_world_size()
+    tp_group = mesh.get_group("tp")
+    tp_rank = mesh.get_local_rank("tp")
+    tp_degree = mesh["tp"].size()
+    dp_degree = mesh["dp"].size()
+    local_batch_size = args.global_batch_size // dp_degree
+    grad_accum = local_batch_size // args.microbatch_size
+    # logits are distinct only across dp (cp/tp replicate the per-token loss),
+    # so the world all-reduce over-counts by world_size // dp_degree.
+    normalizer = args.global_batch_size * args.seq_len * (world_size // dp_degree)
+
+    torch.manual_seed(args.seed)
+    model_args = TransformerModelArgs(
+        rope_theta=500000, vocab_size=128256, max_seq_len=args.seq_len, **_CFG[args.model],
+    )
+    trace_global_batch = args.microbatch_size * dp_degree
+
+    with torch.device("meta"):
+        model = Transformer(model_args)
+
+    def input_fn():
+        return torch.randint(0, model_args.vocab_size,
+                             (trace_global_batch, args.seq_len), device=device)
+
+    mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+    x_sharding = tuple(placement_for(n, is_output=False) for n in names)
+    out_sharding = tuple(placement_for(n, is_output=True) for n in names)
+    print_rank0(f"LLaMA3-{args.model} sanity: mesh={tuple(mesh.shape)}{names} "
+                f"solver={args.solver} in={x_sharding} out={out_sharding} "
+                f"global_batch={args.global_batch_size} microbatch={args.microbatch_size} "
+                f"grad_accum={grad_accum} seq_len={args.seq_len} steps={args.train_steps} lr={args.lr}")
+
+    t0 = time.time()
+    with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True,
+                      solver=args.solver) as autop:
+        autop.add_parameter_memory_constraint(low=None, high=None)
+        autop.add_input_constraints([x_sharding])
+        autop.add_output_constraints([out_sharding])
+        sharding_placement = autop.optimize_placement(verbose=args.verbose)
+        parallel_mod = autop.apply_placement(sharding_placement)
+    print_rank0(f"trace+optimize+apply took {time.time() - t0:.1f}s")
+
+    parallel_mod.to_empty(device=device)
+    parallel_mod.init_weights(buffer_device=device)
+
+    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
+    inputs = batch[:, :-1].contiguous()
+    labels = batch[:, 1:].contiguous()
+    input_mbs = inputs.split(args.microbatch_size, dim=0)
+    label_mbs = labels.split(args.microbatch_size, dim=0)
+    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
+
+    try:
+        losses = []
+        for step in range(args.train_steps):
+            optimizer.zero_grad(set_to_none=True)
+            step_loss = torch.zeros((), device=device)
+            for mi, ml in zip(input_mbs, label_mbs):
+                logits = parallel_mod(mi)
+                if torch.any(torch.isnan(logits)):
+                    raise RuntimeError("NaN in forward output")
+                loss = vocab_parallel_cross_entropy(
+                    logits, ml, vocab_size=model_args.vocab_size, tp_group=tp_group,
+                    tp_rank=tp_rank, tp_degree=tp_degree, normalizer=normalizer)
+                if torch.any(torch.isnan(loss)):
+                    raise RuntimeError("NaN in loss")
+                loss.backward()
+                step_loss = step_loss + loss.detach()
+            torch.nn.utils.clip_grad_norm_(parallel_mod.parameters(), args.max_grad_norm)
+            optimizer.step()
+            with torch.no_grad():
+                logged = step_loss.clone()
+                dist.all_reduce(logged, op=dist.ReduceOp.SUM)
+            losses.append(float(logged.item()))
+            print_rank0(f"step={step:03d} loss={losses[-1]:.6f}")
+
+        print_rank0(f"\nloss curve: {[round(x, 4) for x in losses]}")
+        verdict = "PASS" if losses[-1] < losses[0] else "FAIL"
+        print_rank0(f"SANITY {verdict}: loss {losses[0]:.4f} -> {losses[-1]:.4f}")
+        dist.barrier(device_ids=[device.index])
+        torch.cuda.synchronize(device)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.png b/qwen3_8b_autoparallel_30steps_loss_curve.png
deleted file mode 120000
index c8413f8d..00000000
--- a/qwen3_8b_autoparallel_30steps_loss_curve.png
+++ /dev/null
@@ -1 +0,0 @@
-/tmp/qwen3_8b_autoparallel_30steps_loss_curve.png
\ No newline at end of file
diff --git a/qwen3_8b_autoparallel_30steps_loss_curve.svg b/qwen3_8b_autoparallel_30steps_loss_curve.svg
deleted file mode 120000
index babd3d4e..00000000
--- a/qwen3_8b_autoparallel_30steps_loss_curve.svg
+++ /dev/null
@@ -1 +0,0 @@
-/tmp/qwen3_8b_autoparallel_30steps_loss_curve.svg
\ No newline at end of file
diff --git a/qwen3_8b_autoparallel_30steps_losses.csv b/qwen3_8b_autoparallel_30steps_losses.csv
deleted file mode 120000
index 47d30691..00000000
--- a/qwen3_8b_autoparallel_30steps_losses.csv
+++ /dev/null
@@ -1 +0,0 @@
-/tmp/qwen3_8b_autoparallel_30steps_losses.csv
\ No newline at end of file

From cbd95757b785e94918ef2869badcf48de990c7a1 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Mon, 1 Jun 2026 10:14:17 -0700
Subject: [PATCH 22/27] Drop committed loss-curve/profiling artifacts; ignore
 png/svg/csv

Authored with Claude.
---
 .gitignore                            |   3 ++
 qwen3_moe_mast_20steps_loss_curve.png | Bin 19666 -> 0 bytes
 qwen3_moe_mast_20steps_loss_curve.svg |  68 --------------------------
 qwen3_moe_mast_20steps_losses.csv     |  21 --------
 4 files changed, 3 insertions(+), 89 deletions(-)
 delete mode 100644 qwen3_moe_mast_20steps_loss_curve.png
 delete mode 100644 qwen3_moe_mast_20steps_loss_curve.svg
 delete mode 100644 qwen3_moe_mast_20steps_losses.csv

diff --git a/.gitignore b/.gitignore
index 1a6228f1..4936ecca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,9 @@
 *.egg-info/
 
 *.pdf
+*.png
+*.svg
+*.csv
 
 build/
 dist/
diff --git a/qwen3_moe_mast_20steps_loss_curve.png b/qwen3_moe_mast_20steps_loss_curve.png
deleted file mode 100644
index 8b4d9c43f227e00009f42077c6b257e19591586a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 19666
zcmch<1z1&GyEZxj2|+@60hJO1q(!6~MU)nh?vzHlK><;bl5Ujl?pPq*5{piyVbL9n
zf2`%Zzx{pR-us;YKj&P>%j;sU8Doxkp8L6<81s{YoFpOsZF~rV2w%Q<rU*e;F%X3L
z2oDpSvHYqu4M8%oFQ17iyCfi{ojphoXxn#ezWEb%Jv5G>Ba_P3-zwq3m-xUUCKfAl
ztLugl-rbKf<9HvhVW^iFZ?U>7b+P5LDwt6GJUIKk{qg1PuiU+hUq|<vXrsHij!vAk
zW<Ab2j}pFyhhu}+rEuShU;}@l7liLwz#rvLcL>0r*LOQWXGA0MO27|=2milaa(DW@
zWTh*cvR|mVC5GRC2L;-z)IhuizLLQ<6E*9@v3ta{Lcu|S-BIvTy>}k($hym|;QJTi
z*pUT&*RB)OAg1i~`jO3*TQlC<bMx!RetAjipX+q9o|ktems`z8-m7I~S4=a4x3t_p
z(Z4LL^&Uzs?c?y3Jg+l2cz&Y)?fuD-jc#~NoTFT(P4`4=9OQiuH_dt&mzY))Ir?E}
znN84ZmnAU3ec<jEuBY-=X$5OS;-x+7AUnHZ+qKQj%@fbv2*%fSR^1uK($S9`voyrF
zrgAS{F(ZToxY_oOm=oFt)QdEvJt<>^73o?6v$7s^k8aF3AGh`XT#30k+7l}zC}?NY
zf2i}$1DjxJYdk;IS&p6VnO<VZJsQEs#(%l)E%4VI+4#z<FG$=$p5>Rg4|U!3Ct%bh
zPWkIS%U>kOxp^L}56Qfqa_-B&8RTh2T-^@O(`U(%<Hg$DTd?(1Qfo_<v%Y$pnVS51
z$=lsKN~X(~#pfZj8BuK697kN2TGbV%mBACq@0QuWkJwCDzq4a)nR><W-~k~upPaxu
zA<x6}<LuIQ`9$pox8|>|Lw2sqPwZ-1TGFzzRzku5TppFYnKzH->xbeF;vEn3ri&c6
zeRRxdwcV7N2e+;CPG1yGR~@)U7t#uTi8yL$X=`n16Y}`Wb#a1_?pf|qP&yDl?vEpw
zYMRHs0lgCrQHput;YuL|)I?hv-Uk~^LAm5Ag^I86!XhHdR{CKLYn;4{onsN-5H5wH
z4Z=dTdAb~z3r(y|nTG{x`?-P#2OB4W7*C(4|Kb8KoT{)-K1?FZ<*D;HVM4&yChN_N
z-)z51^lr;E^gl^W;$L^>YehaWQ0Q2_SXb@oPwdvjyKcAWHkyQ`6h{};RqJ**_vK|O
zr;&?&XWV1^q`yjKq7xG8oPMh$CaxoA%(v^Zk#QP18ZXxx2xVx^6crT<y_(bhT93SU
zHnsY@k7}I8&h4_oQTps~ZEZ19`)n@N{c;3Jv^w=I|6IBKFuyGM@#ApwFbWLlmKgcS
zi0K?LF>QS;a&@S*+xfzMu|Jwei9TBKfv_5{X^G%Kxy9Bu>_Al})r?!@y@`W`&XteT
zKf5pQ+d114<SAR9rVZ?tj@o?q;4hiBWPGXhzy_=$t9nuw+j|Ftkt`rjr;alQX0<8@
zjr}4QEpRUTSjAS*v%3gEXAT!c8|!nqXvXmF_8`jPERC0LF<_uub0Q{W9{b0uXZqK7
zb6-*t(>$j^F1y^8jHZD%<hPvLU=2ySEls88j@n2~3e`BB7kMG5#!IVfOo$#XjaI6T
zHrI|gUzBKnxOPiPwHonIu_`nnWVtsXWiAjZ;&nV<C3pJ0JwVznpA=SMZ`41MY9Wa*
z)z;F&IauFOUoK1!DD~EzD)!8kiPgGGE0V2{T3oglo)JVJawaxG!{yZV{sFT;NwvKz
z?JVr&M78_uH#Q+yWQ8OV<@Ta!emY`hB?heh=7jV1T6sLX&NEk+DvG@$i``fC6|C_l
zeF-lM=G%HF58~=#ADAmtE^TCs^y>m#sObf8VsG!rs+wUmtD`+T&NR1m<kj}&*22|C
zzo)bKwX`_}1Oy}{+Y<ymg?Sfpo^aNHSGylIgTB<+3xD>o+{V&<Cc&y%?$zJWJZe4l
z`SWKdXXhhU<rI_hF6xQ$uJFmpeN4sPj;v9uskqS4J65B4D(W-3scVyn*A__Y>F=L*
zlotAz@7aiWo=)ZMs@S+tM!?ob?e~w(c9C&#tl5+K?BlqM^a*8eN<91LZogHs>Janu
z1$+F~1qHLjYlbT0<Y{~1ypBIYO1(78j3+nG=7~Rh&%wM}&o#d}J8cQnU2YFpjM@l$
z?aJVV!8xVFel$3mE3f`6FpgrOVXTj!dfI!cU)d4Z5u7Z@X}I~FbD53)0sRRD>(uHV
zcW<1>if%^R`sj{0-d2JR1M%%M@5E{u81GcIyEpO|*(H^LqX^46eEnE!hHr1VTPZ_%
z=d|2HIY;3+@}BRf01TBzXUaVD_D!r^r~I{d6IT@#(-lwhl9C1w0fxyZ*WdBm?puAG
z_vX6U@&Kvnf^W!0{3lxB&vj&O%d=Q=-MGpGd3UT5U@p~SLx&==F-|~28RU~6+@}}z
zzV3Lw5PMxj*8I?Hc^j9QP=%6`a<oBdt2o?5r}pjo!TK=(x<eY?!J6hvHXL0J;T|Bo
zonj2p&V3%ju0gbgy{)b3y)aknfk9YWkiYbdOt<Gqg@02BzvrR&1wEqe;_*5FMEPVP
z`4=)C*U21_zZPTdDc}}^PRldC85E)B&!2mq539S@{&csQ?Nq%33$btGA@wq4ztY~`
zPATGlD3~QbGnFw)PDxQ|)lOq3kSssbM=f&TS?IHG+rD}6>eaxOhrhCitde&A3q<9+
zNA_vA5@wM2F`V9=*_;$TOoQg%4GlCrM=nNZk$Cm?@<{sZp{iba(hO=r50+m|smJJl
zK&NO#Tusg+bSa-$vG>1>e+U=Z$aV>XuXB5!8W;Kogh)uX12j$)^3Lv0p?w|{Z?4jm
z!Fln^_N1%4ja2}-v#hIHIQ!*Oko1cecp)yfyZa}z6s(u~-ASNnuJuQa{CFirIrbFq
z7ZC<XuTxy3j?iye$%W_J8~)ORAE;h7|G1Z?`wEOEx=<@RBm!%**qdi};gC&Z!IQ4V
zHy}(GS*YvMf<@;St$KI@ldEws=>IyKDb4Cbr&{2QILU9iTG1>i|4gFYQoZ)LCkz4f
zCc>$~i~4fi#yjmvBQQVr+T{*!@rt{xY-|Xwo!M}jJlVp{|81I|n2_K{Ldk98eY$Z{
zPFWdDCBW!sYwu_^xoddwyO~ySWB;{xw*EcILrdu;Sro*7h$O&YIJz$WOtO7{jc0#w
zG1mmH7v&s$`eB4Jv(C;ayb?57V&cAc83pFH_UMNqe=89F_zHxu?QzWh&mm6f@aG4z
zg|_R#^&G`-2c3r62on+#gsrDU-Hvw`b9*O`(u^I*3snp@E8n%`g+(!)ysPtE-n@uP
zcFdhgkw_L3VNol2%zK|2rr&eWU#!Ib{X0N-yVaJYpL40eFj0uV!94fAIpICDCL%{Y
z;f9!Mx>q}TbXiloe~Tc*WqZia%p~zH8CYQVu=q-A;)}Nl`#3%u<0Wi~BEQ$heWksU
z8{DQ%%=kx2dTMxkV(K69^FPPCuFzvMnn5<&M=khPZx2ku2cq*D@oL3e$crSN9k9U1
zAADa6dF?9C^d+(8HW}I(c~d$h3A)l-DHG;B+=^Jz3Q6W?zIYsZx3!ow)1PEufFqg>
z7JHs`xE0WobkrW6X`^Gh5Q;o7S3TOcEqqlzSUGzxKjFH%3nz>xtRtQ3>@*u$NPdEW
z<SQ|yoVr8^H1um<8xg5G$<+4XdiQ{N)Mhrja=$NW@BtU4D(jo!A~~t#porrJ)fA^7
zCh?D|g*t_5^{<jbEe6V7jpX47o0TrJdLi#hO(1I|mv6(FOuH5oYtl?s8hK=I7wWcO
zKK8Vh%FG>2%dZp7*e`@HPs_5vTVBDAD+TGEWyoB{e&|nNEUmlHT7Q(dd)_^~GVL9+
zPv#9%XnL@hvAEx*ZxAN_k!yVx`K>}d{Db(%3^Bs;B)De?Og**KrTm6Hj1Wm3c<bK5
z5i`}kz7Rs_XfwcOtju$Gj4?=LRgEVLkzrNa%PNrAlf+*E1~73!tU1EAx$hc(BhG-Y
z-ma>uYGJSQOg5x%*L@ctl!1zLh`PvznQPeWr5YXX$js%sa=lb5u!ZKz`NO5|lGD+p
zS4!E4fPwkSjZfj$UszNa7%0uQm5?rNUgLafc`!K36v=KlqVMBKVx({XHGI~(+xvN*
zVs^qg&zn_w)xn<J+IY<iey+s;2D#*LZ}-mds@>VX1Wpr6g~E*UUy)1^k(JJyu9JIr
z0F%`^zTnB_baSb4ud01H`RWx%#<hhh|G9U~UNv5v!x6rY9w)ApW`;W@@O7jo?kPob
z`D2&ug8G03E5sLgk+tkexYJ7V??Rn=If(~LZp&HW<*-Ka`wTEy%0Y(-foXz3wemw7
zgs|UpR{j;Eov)FzeLf!~1w$$2&hgK@6!Nl2zP|}O!uni)65se~XxVzU-li*CX@?75
zIO1U9N%>JZ$YHmNw&6Ul{M_?9Vkv*J{)||&$VGyQsgecuqaj_&Bc*h2KDFcJ&{El(
z(Q->T@rBgu=!VVRH4=8+2DihhFAWP%kVWyxy8&f0c1N##Cw85Fe{hr)30nm!<Kku#
zc4#?XoT+iKba`g7?U!B3V>wYU<YD{^%<i@cyIQ2`;l^%qNHWYtfBPmvSA-9#?K;!k
zT(A5xX!tR+Y=cy1xySfdc&_qPgM}`jy3U(3B5qo_Vii&k=$;uED5@P}naPVA{K8!{
zrxaX82;kVs%m`MO(biN|;az9&dqLiLiYx8C>4Chvb{h0kcln~&eU^@{CB0j9(rKgE
zW?gqTefAkw?rX2vqk3TC(}MBno<(*o29mMfy@abp#l+mF4X1nhVQR)^wq|}~BZ1SD
z$n0(UXyI>QS6S5VBqOT_OX6xirS`?a_r~{lbgdI%$>t(G<}0Q5Bco$t9M1ZtY1`$-
zy;t6Nc$}BKUJnlsZ%6XBMBE*<9Ajc%9Lehp*hQKIQGPg|Oe}F~j`FInO}BQ4%2$bB
zoyEzfF<a`1eNIIAbKP6ldba#>Wk(5!+>nlv3f@lles!|<grd%c==GDmRj2#9H6G=j
znL0R`+s1{xC;h%?9k_mCx3I8~=2-`9r<|M||7EZAJ$rT{+1v2wf@1NHfS=kW7X!y(
zS_4iWe4<lB_7TvGk)|&!c{xU(AlI{r1I<(1`wt#y*U~VF4D0kNC+QQ`+i$6?BnC!5
z^&^R7j>^^A>1|mn4mVv`%>o|Q>bW8Z8#lL#cV-`LlWZ(|4{$)i0@|B?iJq|cCkaUw
zyl`J<8)6RYKWK%mv%cWJ5a2W2Po3F`TZ+>E?i^fGv<0*{+_ZlIk(PG<<dTz(jSa`X
z;y!R2{b$+lv#Zhh9a|tWVs2vTa(P;m&9q*EdDzWE95ceqX~Pviyl5SL!U{L%Jes@%
zzt_8%xVI-S63ge>Q|uXEk*Cd43CE{1F!)_C`(!)ST$W$tllVulJYh<fqwJH3yhZ97
zH<=wPJ%&Ui*1XhuyuuvL!k!!{D`qDO+YD8zsv@vy?{{`~)*eyc=C@1O|Jl+Et>5YR
zkt3X^t2r~{y^>XTKYU~5(-8ajnRBnMudPpOB;KdFMM~`+_RJzfHSm~TKU6B~I2*ju
zxak?dhP}?+VbT-R#9%{XHrWe5IQ4Hn4ZIA%p`EU@D#s33P2fm9)D_>Fk>3%Ld0u8s
zbKm*OQh#)%4&pWr@eqxkfx%#NN9B;&)vH%SXklhjeG#3l$N8m$eH~fM7LISH8a%fU
zhtp?v%r9cU-lm9lmN^4hOm}%I^Fk)V^r*5x^ZOolz3qnVPeISa_jTX>uLInr^saPd
zYRF)GZ9z^7BlKH*o^y3OS@`X+JN90;I}5+`8;qAPUuO2zRkNq425&Uuc-`~1$+Ye@
zvFO^`Tgv3(<2xBhtrW?hzfR05K+W6(<K^Y!J3zn>_P<#%lV9-Ljrjl<D$Qu8Qi<4|
zm@16v3Cdie_|Waw%#rMxiW9*e=L<(7Q5GWcp`nl7u2G%nkFrlx*ED!CC{#awFr}rP
zjv(~5Ar&|3OURb#`~^fBpzU?>jWN$VbR4hvMFlW;vd)&Ay{EdP1)M2Y{L8CQ5ey+y
zalB-~CD!`;JKMH4aKRT7n?DQE!zw#R3zf8IW@-!z>*@_d4jIzEA`MKVjE#m_<G8<W
zmAcJ!aJXI~pGqzwxEW!jx`z*Ui}#((J|PoA8{ir?UK6#(k#aLaW%h|9VIO99AG`#z
zM+RTxyqLU0c2CG|W1*kabfr*uX<;E^XBJ0l`OB?YY1a&`>n}SyU%s^Y{<XBURB*=?
zI46k>c$CW876>E6B3|XjqY8)TIqAQCg))Xq>!sW2dX1cBUiU>JEyB+6fjMR{pSBuk
zY4+KF9_nA-8}me?!SSpmyOg2-nDxA=86Xq87XIq~>r3EzNXbXCYJXQfVlNZ<3Fx`_
z$>3mx!&Nq=oT}j()oKGiI=?CE7!Jb^Op#sPv}3@FboKY66ZvvsF`Xe97NXnW;Rwjq
zZ-#!E<odbr!)YQK4qJByrJjV^Eayx8_=?RMw+j;o2N@X|5Il%>Kcu{EchDOzSK{Wz
zjeHbo^ZWSi7{uuB0Z;c0Yj$&3k6~Spm&h~e(Xrj`*a3tPaA5Bw-lr3>V4RnQvf3XT
z|5^&>w@K0xs;zfbI(+w3GMYtq5AeX!W%k~LQiTci8owpr#s(#DaB!>wQ*}|kNl2;C
zBd;Oe@ez!h=VB5ol%ewIOoN)}E9Qyk0>yZVr!$a*?3;<{IkKM^0iaTvd3gM=A-KGj
zx3_bYO1~lEu)h<YxwEqagg@lAfJD-i6|mve)HAESe1^|m@wb8LJMm@-`4lAJwPp!x
z5@FJeVXX1m$)2)Bsf{5+UrT#C?dft$A&gZnWGL40?M;5WDW7ktm0J_eh_-x+SFga5
zCD)QrQSD~b*2H*UU%k+$r@>LfwK;rWzPQ&}`xgGb5Jv5_*5eO?L#w3}V6H`%V<xzA
zdce|N688erC+KpF@X*m~wY#*AfD7mO(Tkp`+5jtujNv)RwfW#DDd=)|-rA@yvZ@*x
zF~R5;6B8pl>6BR}6$X5SaizAdCZWXRh0c20zJ*Iq&f`}Lzcc%joL5eR4olTbTx4z?
zJ-sSh93iaLz#djX8$kGTwQo4;^Q$AEw@u9*krv$ebRR;Jcs=%?on%9ocVU<O9O>21
zeHUF_jV~9MuQNt6OcpQuOl5z+@TRnS%@lcQF-=eROvZAmBJoVa<NTFILxx#4H>)C#
zqM0{1J4j{`U7(hGN!R+B{Fac1-f2Vv{ZpL%&7TL9t0HBo?egK9Ix5u$<61L1%j`NT
zHT;G%N1j{Dv^d1Kh3Od?SHXH&!B*%HTT?qj>C7W9wmhFyJ{|1{DN?5h0OPWmE*_mT
zcK{Cky?}k5KmL}2sBm(f1v0@I2%fB_bE2^OJQ!Kl%A4wZbA?$Np-TY@=G=_(xI7Z^
zwTC3)SB}q&`=efa>DD>%<fl1Y?Ja;sUzU6A^-bf77wC8+?{@X}!UZx_8|qw{R9Q4j
zHH<G>J!SSCL;8%R$Y#rregbwuE9kMmakAGF`*?ZxkZJA6D}U{^u-S;3&U*&!NM<EF
z;6QY%<j<CF{RFPo_f*5FscDZ{W`$A>z4pT>n1mJ6^F%;9c2WayJk^}Xb=QG+f4s2D
zN$kwy4f`P$wB|jMaUxJ_=uZOd<9LZ1IipsV(*YA=vLbXj_RCMCHf4-rUG6#k%vCy-
z!pW&$ze8Adxnw2E&BAKJKm;>(d#GrxTh+Hv3xt35l^uElvsSL%N&pd6auBo8ztG6n
ze8RCPZRq)$DO$JOsBc(zIho&Hn2zq79?}DZF^*qH_fI;m9c&V-XylTMZISix820_(
zTxRdA%(5EHdRI57zE-24XmHx9f4=ffjCN$weLK0EZZvJnLsfP3{fQ?gbJSVMQ_U0d
zw<u)@@TWL~6d8nt8w`&cKosPG-*#uX`#~~nmAwiJI$WCw(j*kNX0RH~R(G6F=<FK7
z%EA^ZDY^>AZl4(v9>s(2y}iB1rfV^abq(5>+=X#>OcJh*Jh8hES@m@M_%KBr_y%bf
z4KU50g!V7y8oQlBb)lgn;u1*Dc*vCfqs#WAl~gytFCv&TRb5(|7jt=fk}B0s9kue_
zg!Nh=Lsd^8Jt{GB2wK0z`S!HVzx$f-Yv3Vm?(djP_h8%FUQvgjJO+Atr^&wh_{y#7
zx13LY&8J?IyH!_TFJ+GT{x<w9%ZvJ+G&a^x$xx6^@L@_n@b<#`5F8cVgatAOc^^dd
z!5`7c!3P~}?d_L>Z=biuiAhM0zM04v3NvI_%p{`XpGbpK&_?8HpQWnUFR{Ork&!ZE
z01xRstGO}`=sB|9any#hEm^QC-8$bKgb8M+-Sx#k#*9^M&U%d2NQ4hbR<HuURzi6{
z(qfcwQzj$_^|U+-t&wiMTfOaS*E>%blV;7cio7IGcgF+fbL0<HoG=6g$4_=&`_^y)
z^GmRBbZfXWfWC2dPA*Utl^78)Jh|h(JaYj6xI2o8ZHrN}rUi(!9|<<WLauhIRG^c#
zcibbZqlTxzU--Mn35&jYNH-lorG9k2#G7AOC<D;6N-9}v(kJ+KB0+<;9K?#Fni9zi
zQMn(dwq3Cbg(l3{$JGh-BrIEe&hv@`KN<!r0B>FVYz#x}!9lEeZ=tW;95Ks?xB?#e
zQsfcZo4}`(**_^4$Gr^vEdaz|rK>;)I+MuCpTx9Jn+gE+BJ-$Dy&E-^9hcxQkkr7w
zwpr%#8a!Xb*=Oec384G2Ip$&jS1xW}G8YFUU$%6=O)~7`X-drO_i814@(u>Shp)x$
zPr43}zLL{5p6vh>&bWIgE6tJ;SoWPW>X|@WV?5Tc(5a$deLCRhza@zjA9pyd3w5ZH
zuH;??J*qoIB5Du0EGE^J8u}zzg=edeQjoY#n;fF2qPntrom~f3brS^9;`|O587}_(
z<s#<g=Y@oWeSR&^T=52BuM9Ks=&Yw3&OqWIP&G@b*yLx-r=jq#U(0j>q4mq~NDQ7n
z>vI3=L>bjFOoBX(Q1d+#ITG<N$7YJ+;@|5ynV8%VZC0rtyw_EaFRTQP4%c`b+S?Wz
zwpQ6XxXgOCRaoCEzd`^F+h0Q(XMd$l5TKnZAFcF~@L-N&MkH_;f;_>h&sbFzKAh5f
zxG0#!pfU87#0>ir(}}8&0U`WO>zZSg3biA>3(>cQ-3(S0Zzul9yUa$-IX3sG#@uSG
z@#g;9bm=#OQ_~vU774QqTdmS`QWTMQm2xgz7o6Wkc2(C_zs}Zbc(&sMAk2|yti&p$
zvQjmD*VvB)MBXf??>>|!CkDPN@ak%uE;2mEK>T1@OWQ&vKcUYMWa|`g;|WS8Xh2-l
zoxKP4M*kwyi;Cw#idH^wp*N=N8&Caj8Kr5Uyvsr0F&9i%y-v4tP0UoW8Jj9mH#DpD
zLabSETLyjlRPVZe7-pEcHd4Tb2>JXOX*`jzg?lvMZA^KNnqRedU-PlsTHD)?sGpR`
zc86SCIHb<MABW+gJdg^@+3C{RBFAoT_HT?)Z9}dnSMcZ#5i>^TMIcTHpd^X7p`cWR
zj!u6e=IX)Ph}#MO%gga<*k!Tnq#Of7CJ|zCg=!nn6F|?LE=qjMN>|&(n!Tws>kPE%
zo`ts&mYQ_aAub&^-%ph1h`62KJ)i%*vZ8iEcIq|#6mMsDvHR&I)z*OXN=vdN`N@%W
zW6m5RQfRl6Gd))U$56Ef-<Wq}jM{1LeKDs~`)_bbSI1^Oq5HY7rMQ8?7Dn^*l$NCD
zrIl2>OI2;T^?*@(!G*vP)eHB~$cVw?TOxNdRV)1bo+UQlxIyZEKpDlHIhv>YoKJxA
zIHE63&~2h6Ec$7|!3IxRM4N7Hy5mH{g|Sp*mTv85lDojjc!f7vX?jg920F<n30|1Y
zeXKt_JX#%J1bX~)veHzJ1_;{k`DVVSHnr=c>e*VKx|^b(KD8--^F5=|!(ZZA!=Al$
z;mz8?*AJWZ+c%4_D83{X3nUeNe36^jO|D6BgEah<i0V2erGWdIns2p20C395a-=?e
zg7cg9sFk>gHoKJvCkf`)dm}!u28rY`Sk2bc(zq?_M6*x0R@IWRgciDn6BE<$(fd{1
zq`F~u@IklJydkb}N6Aabb|uR@_BKxc_l-;-YK?K0Y#sC<DM-c5zuP(F%E@5TwQzT9
zYpi}N&CK491cbwBcbmsFE4SKO+VaPXj4$U?50F;(%X`fOE_mlh5VauiBkX#59))mc
zieP1>bvhp&?`W4_8_uB@>e&kC(NyX8msV+OYn>(z94m4Kz^se6qdqAw?i(U?*|0uZ
zYI4w+Yf6kyd4`tASRj$%8NIQ5&S7Qsmhc)HZ2^x_n5x>bFMw=(I?OKk*QDDVRW{4P
z$z8OO_uMP+jinvexgN2ci-rC0H<6`LDsk~)Rn=^;?7Z~6m`-#*+B;c541ep*%#Z8p
zFHnc24+J;@T}uxDVBHhXH^2On)3jH6VQEPt(**v`v*EGgSH)x$sB6weBqZcH5ORyT
zm5zqnVn?0g2YxKmkT1*E-;h%McCD?zfehbv>>WW{-xzfWSG%=+KFZqoF#g|X+9~TL
z@(&nGwt=~Z6oYO7(+p+Z;4Bfxn-_2?ExXRo)^Td;Jcb>P0YQWBZ;e+ns+dxOdn$Sc
zeDl2Q%I%^KsO!{EO?$(^0*+y55GWd{RLN+|zwa|Dx*J3Y?qre(E&rmNrsEslWQ1y+
zl+P`ckeVWm^x@(|GnI@oOB+$C2cUJ)r4=!ibDr)D{>K>>BOazwatGs)+6Cas9ag*&
z@rQr?mPx+mWg5!mB`TGg`ji6n^R}4EOL{pR%8;0)HZ6V!49LgxPDjK1BI%uos#jJM
zrN0Gqo2HQ3H&_9TVZYD2kd|PA_cuJ#1cuxKucV6r%pp?SJjh|?c7)tO)Z`u}<fDAA
zgTEkx-{F8Xr)lYmi4|zG1h0gsw`C<3+|mjI{nQ{a3cGC<Qo(`*b3FL*4ng-Lj){!@
zztq7L#suC8+&54u16LkVfNZN_S3<7Sm~k+;bW<$UA5%u_ygLv32Uwa^x(@MMTi~Z2
zZGInfn&~?r|0CCpzTN!&Qh8bZcXgBk0px>qw?mxz%d!T&1qNg*dPV&qBfN4M9-NbH
zAwK7H4a@+w7<rDV8!2X`A$U9q1Ym7?%nKrqLX9yX1?u-K8&p@xAjn-J6np0TThIrx
z&J+g>pYqiJWnbr^NTho1LogNGCPwogzsv@UE$8d;_k(4m1U>bA{8dV+ruODH@Kn5b
zD7K=JQMraVn8?rfIxy*rU9x1S6Yqdwe7Qpq6BMffDoilM$X}2uiRU-HzOLv7gf<4u
zjGBzpH<gJcoN9pY<m(-nt9Ko=LigoOVT_eMV2U#Io|C`Om8rsnLgiw@ur0BbO2Kpf
zx6~VR%50eEx56sb_kqL=aoi(Ni$}`7CEP{JTI6@Ip0p-L7}z&^3fm}zUaY1<kgeJk
z^+xjaGL`5)&<OA2_be%20EQ`TDq08*;DeqwzPoOO;rJ-#4faqO1mQjp#a8~H0y=*G
zin?g)n{;gSGkU~5=IIYsF`*4C&|RkosP3|-V<}42fXOD}Eb;Mt%!{3ytmTPoW1>S2
zr6~4vU>=5K;X_5zfYplsGyy%Md(Sc_x}Ry{$WQVSbcO8I{|;$sYiT*8*}edQS=EAR
zqt(ylBRLdBxD%alz3z8^nZ5))p3Gc`uV3m-xZ`(oXDAJr3s+NR<^74E$>)TZBZb>%
zlTMomxXns%P`BqSbIQd#;2Q>k_)udo7mxMK=AgZg_rQ<HuIOUB9Ur$_J%xjEBdOR>
z*2mub|DX|rPTeL!rhV!zJBlBjLS<jP7^`w!{$3q!@4DQxm{|HU6~|)4hOPsXyonkJ
z1(Sijn95|cWSH}oKgihT)$eUyY>!p(%KE_e6-X5T<3dO&<Bo9x<!d|~w+;;o3E2{O
zV-z1ndv<og(9zb`c9l+0a4J0@DM{VHz<||paIa~U^*}LW@35<&!_^7{nAOkDlVim(
zWdja7WUiSJZx%Cq(xZwqaRaDpn~Ag@Ikj!N)YX@0BDA<~8<7IbNqlp2X^9^s=^T^n
zoSjL<TUuI7I)Bmmef<16>Z#R4<lwRTTDlAc`St9-+sEhh3wBq^7i4Fa)U;=K^P0zr
zz;3KqV51S+&~zi1f#vR?`Ep6?o%ooTVduGzV`F0{Fyya|JWc(kb79~zH_@pwzkbN<
zheuSM!D&ozM|-E@5#$q8RDCBtBI1ed*h&Y3JVkHpSH|$bIvDWP(m&&ROfEB<nl6K+
z4nuzekKy17C65g#L9(BpV#FuE#+y<0WMHX#J<-b_1ELD}D&{m;#s?x(Un`>XHAYh{
z0=7<%@Cm$&057L|CR(jAsB{nK2r;#zgdnEksM>OFZXbSg{8B}}pKjKx_bax~dl$`{
zkSL=Za2=#Z^+@YHFo6;M@ifnS4QNI{Vu}GU9${?zHW?}Qmya$IgPVh4n{9l555GOm
zaGAUuVqrn6>?NabX$1tO5s8tDx($3PJ?)HSA%I>Gyl3&*UyNY|ZUpn4e-wA+J@YnK
zac{+%01&kVYQBnS5G=Vc>*zm%OgcVAcpVz6j94;0pm>r}O^)^!ePkc;iskP81wj*U
zSRp+t{p(@Rli<3(QC?V(j~Gz1A_<wFn9$58V4}E#u4+J{QUEeiFi|J>0YHe##-w0F
z9jJ4{sB>31OCV*`eYQs6VFe-jHyDi^00G8AoM-NT@gaZ?{YEgM!{=)}p{Y4upW)*;
zP}W~!<UZ4b!G90V{BP$IJOnD*q=0@CiCs=>PiW(o-GY2>gVVhD2o?+5R<>U(P!>VA
z4TkL<#@_BV7SZlsT2Kl;c&*PS^^cfDZ0I$Zn?}kObx46ig8-TnxyDV2cS;Bp<Iz0=
z2xq0I4<FS%^e@K8RaNiwoX8<dEHHv?>#}}KA3}rY<dBTS84EP_7l6G+LDb100G^Pd
z?;&WE3fvh=Z&Kb5WXB4iV<R02xC_&T=O&~^3z&c&2hJ>E#;l*{83v?=nixI(iPF+j
zv?&iY2LpRSG4WPKr24Bd#vR0ep$w~XVARiRh#-g_tfddv8&mbfzq&QFp-_Gn7&fG0
zV5<{F>5Czn16Bc&`Xz@Cq1!A%QAppjPo+>OK6phD85sT}0uUCkBmkyJ<{9wpHJ>G5
z`#7T-MGS_N;*(=R6(2yQR)w5E-(xNU2w*srg_Vc{rTBugIby$Apf|w!_;it&Vu%8p
z1nLR<Fd@*Q@)ur$#xSrTq6YwRsYt=84Zko6)ATpeHQ0KlU_^SV3%t4Y`+4Uq>f}B`
z0K|N#AIBIdO<<wu39v*yOpLdTY7b}@a<A^guMko5%|u+3^a<g-zy&_7v`T?ZqyZ2D
zJ<W(~CBOs3BNUz}_GBhKvWS0T;__yD03^^Uf)P^vw=qPg%Vv@3r2#ZpkhvtV+GBh`
z^urC*yC9#OY_1L@ueB6V+#n5yvFc@?!E(qaKRoI&$|$`h_c7;V))qBqiMa?Qxl7M&
z-F-MMbVjIId42>^x8$yiD16-IH`)15e8e|?<iwl|yaM^NZ(5CQ!L5EIvJ|>uG^a!}
zEQpuH9s`jD;<vtNuQEl<BO<5GmlcgeBmcz%DN+uFS5Ik5sy$1)yIhWqzO`vAGG8py
z=jU6mU$b&pjKgb{@Qg&(Yg&wd8k#e#oTwh2&`9Y8arxC7km&Tv6DS3wK1!%-j}o6)
z1(W<*EnR-<b<=(Qa2kYDxF{<&L_ErL5NlGZxKK`}cYr2P7s^Z{izYzoP5QL&Td`}F
z7f6m0%^qwS8^P{_iV3o=sI?IbJwUWUj;uFMFvDmEvWO;voTR!+?EE6R2UEuZ^Xg;d
z<5OR~ZTUW0-iPGZOP7xa^3>$lW7!KU5Be<#&<>4nF|)RRwu6b$&<#^Kmu)u+@Bmx3
zPPEU?C<p6l{DcxJdJCk}hXY6_cOVcm7ammiP|_Lm8Aw$I&?IPyt`D>5I=D^Le5G9-
zI}l?o2FR4p@KA8+^9&PeC+fS<>Syrm3ub;o=;3>CR0Q2i#8E7JFpXKn3#7Qw0EpkT
z%^eZxC~Q4-)cgEz$rLjpFi?CGFs^?tiM><*26iEBN-Tr}4F;ncm7GvlVQ4Lh!0>ru
z2g+6<pTwcqX!zB_249|a`6)~vMu2&cT4dkn()ZcwP&P`Q(`te4B?_IE7J*@*5EzR1
zCc?$+LV6RjM-52o`w%3Wg95g$RB*(EI>-G^4(rdomUuwtWKifgMg;~GgC39sqvLI$
zcO58rn3Mpjcn@`?DG82zQAds`Ly!*<7^^53N+5zj&lPTgr(d8hz6S3@DQb#}!)=HN
zH7W%ja4i7U*D<U_7G{wMan@>bG>P&_49zqtS3;|5kqBz8v^mhKDhfw6^)DfzBKcEh
z`_(Os6cmU7=$0$UGla%yP18XL=??X+SpUcpmZ35qHxx42vhedL=$Vr%u%i~_x>>tA
z=s_fEZm1~Uk}Ghb&JL_PpaxlvGG(H_S<s*l&47_9P=L$OrC0*?Flsh@FrM^bR)nCA
z^uZA~YD7L9U~Fg+YOK9(ih&lPMrG7hf7I1mZxXOAalsrwr2}RlnuKaE+ej**H+pok
zMgoJ#58Nag_mh|4VXHdepOAtOWA~b*nrn#D7`7fG7#_s(lp)`IS!1>-zO%%qbpTLT
zpT%3TiQuXf&bW0w?uj|3Cl|;A1@jogSm^01j(oO69;(*bA?T%5^U9@)!Q2(GN>z{p
zPg4w!<5%X(t5-i|Y(xjqh;Z%hgkuSeOjWtoO7~+RB?wfRbm=eFQMgF>O4mY6T>M8{
zufW;SW>mL;DM+%(-sVy(^$Hx6z<{=CPpOOd0V&LExZ{50<zlIo0K96qtg8>*4o+o_
zrNKNVJK4e2t)slu8}ANRnu}RlCXoDSZq{~NS&e-_Pd}o_?j?L&ldjX{Frzb0##hLy
z_fIZu<Gbj=<4?}{hDF7O92U{W+C;~W(R{~a`A%#H*y_&CJRo%<V+pDW#Xn|cz2O_q
zNOYJkE&;+X(f7{~fJx;v$1mQFZDu;25m<;<Yt3i7P+035g2F^FzJJiU^nfP1(en#@
zX3ZNA#5i!0UO=m9#tA9Pa*0&{0tbQt*Ts1S1b7I|d0m(4K!iMmMx@?sy)UYll3AM?
zI5hacv7NOOO~6(Z>%1diHbJE;JNL^t7tpt55U~_-joP^=x$b-B=DwIUupMrL6MS<Q
zF`8G84ZO7slXqqIQgU)CH8}Bez=id5xj|!ap%ftT3}w|X_9ie<UVayPcAx<2sz>@8
z{3q}v4U>}^0CJD>_?+NnK)Upz-+&II)X(P!>L>tpwCjR<6K#bdl#k`3j#8vCk_;1!
zAruA_@yYaJW}swVFEAz!dj?gk?}IQfCx3*Nby-7m@)uW8l1m2E2OW|ZrM@QD%k=u1
z9s#PQtgBWyC`S<T=>iTATH&*Zw;-%5sAmp9J7|sf`Mg8)2<4YSzzQT9FIr`f`)Cme
zEad^e9csk<($fvhR^F^1h#~-v0~PTA^y=yU!G`MLANmE}bOY6-d+O|U=@6C;^bdg+
zejM8?c_?l0G@!;B6GA;~i#Ih$euy4=>KwOT5hqH2s0frwizv}YMBs=Ree{=7se%Bi
zPr-rN>L8#1nwM-JW>H4LITRD;&xCq?Bc)8QxHf92bF3LA-|nKGOhMg`iMs!0<1mXQ
zE$S%kW48JjdW4TYfFl-EyDmH#Ci-ZEY`ibi`(p$JwLZ5%%Lp`I-h0GF0Q_yBYS}w|
z+2bCr_4%6DsQ}9ct=#~9BS#;7a*B$qOuwpO$2u5UItXrt;GB=y*Fl7)uhb_UMIs@V
zm)_)2uIg!|sF(ihh70Mzt&i!F<f|x%fmnJv_;K)GUpZ^Z0qr#IDr&U*NL<rHF>DCR
z1&bAhVph=VU6#9wGb73V54!K7DPiO7s$^Cc6wpC=0W1UrDBy&02m4D})KN|WaPTQv
zON;POZ&!d=OJATTRpFWy&I^JEzLu3UzM$9+ME$<qYq!`6`Sb}PG<Qba1uxL5?F1>#
z1Cb*85R5_G(?4V@-wn*s;(&>dveIs!&Oe9T50L}EGoK54+3B^%0oTcSm9G)@W?pKs
zd&OfTjiXt!e6HaR5N#~RZu4bTYY0baOzZy=zI+@cyIFdS?ZpoN#e3afJBU98F~xr-
zYmNlK$yxP}Qjd}79KrN8F#PRDLPN7T_*A~1g@dD*zx{t8ZhU7abL-@;)TJ)&#;OZ-
z6+S3V@)%7|8ufA6Zq}X{&W;tU4YjJ49UT+R)D@>1dkts?`Fc+SkOQ&z%}r~Nmx>u<
zQ5W<h0hO+HS@8M!%@L5IqOLvM0ugvcv5JVc{=0>SQU5LOy)Vu*-}v_Xyv19SAdao!
zdqI>s?qv<iYaZ25cmxZH&$Y1ac)hA`+rb5rrEMZDot=yq>tJzJB)VB<I+xz(m6qBM
zrF;lC?E|Th<CUEuP|ILBURO-={XZZPB=<c!hz)I~8z@OBD@~fCE&@VEa<#QrbI|Mf
znBX_y%>X=i;8)mmD%{3%98vM#aMPD(lGirZ5s|6So8=pB@fvhg>aOS2|7Rq%$RMaK
zFsdMq%N10wVBjkg1J*~I+w%i2b78?K-J<BKjxdJ!M}~R25fMkVXGRTmGb1AZYCljR
zQS0jB^#xT*o7ly5K>WtF04s-lgeR`0pzJp^{Ei!I1+9v3JZg#-bDU6=#`|=I8q)zc
z0pQm+V{#M>Ldu_`)X~bTv0B;l;~V<0rc+&ICX}BrCZ*gR4GL}(N~IGgkt`OoD70ww
zT47Kv$)o-+91kj@Jq^5Bl=tq>rP}X_T8<+e3R!p4DA3#mDk_TD)MvJOBx%8eVs}um
zf3|up8YZ0NE^+jX)XzjwT1q4)(hZm`S}Upiy_{LpRnR1<@!d^J&O3cMz@Y`PFscj_
z1Qz<}?hJ+rF1lxL2TahU37Y%)h7(PRAX|sBTPPP8&{4dJYsw&e2SM97Ryb>DKyV4<
z#v(({$jbL#Q8H8`5YA5%y)*RDhXE74-{_+@aIO@6v?Ut<&I%0^b@5HySikZ>AQe26
zVPHOjX4Oznqd?;yO4MlCHhFD#@u0#@A{hPN(A%e0I7N^DOojq8W4Dxjv0XuArpe2i
zRdkTr=gGik9NhX*iZgdT+Cs2AMftBoWT^f_IVk=PoVvfS0ROdamRbAHujc~Q<}j=;
zCgb~!yC4z_MMVsH!aP7c@BK&#dajFZWSFp`Tmj%FgDAvq1(JnDeMAx6`Bmw&;Hx)E
z@+45p59PgMu|RnbAowu=fN;DFEk|m^tfl1W4iqtIn6RSQACv-3Ttl;YDCHVBLXR=!
zCOATiR?4Fi%QbY15t7}OXf_U6VzQQ^nL9LwuVK>r-6#|>;vQN^|5KHe*I~g0aRD%=
z9@pbita>8Sf=9i^ARmd_um`#V9o+4X>rij9@Bb&BH*qEzl6?E*xbq#gh4ob3bow%M
z`Q0B>kr|GFaX>lb{A@1-U{aH(^|(ZROBw_>ZrtFuo*9QZSAxwISj>?YgIP^+2bw+n
z;O0i6pO4SvxG9XbW{5+A0*9^~OA~WpNEGV8%+QS;zKc@-9bw7(=pi?^9Nzr=Je$z|
ze$Ne3Qi~A`v$y{<*ZY&cVAVHlH$Zu@v{LF$;4sR~vwQcj%eowBZm8qGhxIsfKjEfH
zFa{-~_#;@X&k3XCkDUG^D~HYdCzeHigTGJSl&xa*7jwn<V@mStW~kQw8{QZ12*&4g
z@vT=hxffggAA!1~hr2t#?(kSmkC(U_(miY6^kk5YH(UTWL$#7o@C>6+l;>MWZwm^O
zL7~bX{0i97B0V8uV5@6WNX2#AYqz)Y`qPFB4R5o2XBzO7USU;ie$W4=1B+s>AhUSr
zkV>m#m1aB=r|TSV0YdRmqX2*+{yQj}>!a~E6j7y(`=BMx06@{Hb9uQ9T0q@XlC==R
z-8WET`^T;Mn@ly*tgsvC9d+ME%io3{imbPbe)~Ka{fB5Y8YtET<ZsRO9`cBFe?h(J
zX%Wm-9sSN}DYN$@!har&r8fO@GO=`&MH;1kMcz)a##d1)Pt=4gVL>To<Es@@_1Eox
zZp0eUl~t)%k3#i!P7Sd0amjd)L;!bP9~X;yfC69Mhvk2_il}j2cRG`+=UQAJJ}+7X
zNoqoxQebKLgC_qfRC5N_2Nd8LdwBmd(IDXK{t)odcdvEnZNs}>r%j=U)Uly452+b1
zDY=}aYKTU(&{IPh85%QIew0Ft`I%-GX0{RHi%WRMxUfc;XbAQ&J$bvf!69;u|E-?A
ze@VJr0>6dFWnC;ONEtd=&PsXt6EvHDa;JFK`TSQi^7{+D%0k?LnQ8_<+rAi%C&yX)
z?8n18&ODVNw?~S9WXYl<ibbz@fTbjJGu2BjEHT>2B3CO0h5BHF&>)}5?w47d!;>8I
zMf^=`Yu0#Nc$Uiev6m-^k&8RC-x%M!x-9cuk8wUK|FFoI-wH>w0<m*=n4{eAB|RX9
z{_oL5L~MW}=6Z&OE=NQFsHejX&~m?dk|rM1wwU9&f01<+WJ$S8W<T->;jpJlY;cBg
zRUX@PZ>$l<YjRsye$!XKhF*l{CuD39{S5sJw_vR;ElB%~;TbH+UpfuL!gqc(HG?{7
zp)(>Yol=+PE^H9XPUds|7Ix95O*vh(w&(k+{_zja=rYN!5>P3a%<G6=V@QclK;V+w
z@I1S8xqI}@6YJeP5C`|%T^JE=gmaaWe0N{&NA2BFyQ2##Q8omQL&7ee-@i|PNb_i_
zxA)+5cBuU53=kx+>QZD~<Q-dv^YiIoOCt~%Co|0R(Q`ibg?%I;GlkzwwPuF$RfMTM
zs3z+Q)Aou<V0p}I{HR3a!KY6zJ#m9S1_dPwIRai&+E=2bL|$NR>^@YCMq<70jD8&w
zyT#KX(|*&7iWQZW=|ih{=`#T!3Ba49d$vw4yTqn;I;Xz?>V*I|$<wTja}bp}J9zN#
zd2576yfN+U^+E$2s$ls_x(u2>J%tx4dCnb)yw0B7Pd3bqVog}xT*|O-^szyzf?{V!
z--#!eWzz62GB*dr8;?s6A_KfRFgjnS!aNb+%lU%3{yg!B^^_afjpk!lo_1hB!J<RA
zz~X*Xw|ydN<WSr+`GEHU0|T?Ruk7~jRT6Q*tHi~~?B=_7_kejt{>iJL;)Vbteg0%o
z3cOBFUT5xz25`0ap%ffaOrzDbGBy9RXof&BS6kF6J4?{wRTocaf>vKh#=zo_?m|lA
z%74_g@mfXrAB}$$62@ZD>)4Jqnvddl;70kV7%qsJg_h>kp;F8H{ruQy<Or3TknRC&
zeck9!yMt09)f#mF4N^g7>9CLQ$fCH=R<;qdEIikdeA%wz2OsG399~{s6$=P$|06)e
zLXXbhpb5x7$JhSNIr=}o1Zt3G>Kto5GiTy<9nHPQ?1n0)PjMYKIT}?-A6_u4S2r9m
z15d%H*G^@v<?5$T+U~?TN~LS4Vzb(tQ~9HJK(P&ou{aQ+V#lBy@Ut===(J|W*q4o7
z%}N{pnOkGaxq$5FV5{V5r*;8PB!USC+>w*27Ah+x<z#0!I74c;ScB9T368&^P@s}Z
zPVKbGr<XqnZc4%YTuy%c55+wIWgzFR-Jp7Wz(YZKIR?Ab`#MOAwONff)SANd*P||V
z#FbdIb9Z;Iz8D?F1(@xQ$`9G!Z8b4UOic8S=RgjR?=4z9xy|FM0(<~rS363<!RG0F
z$3Xh>T-?WuU`e4^15~+ytj}ef$nwicP;EMpA}9O1eZB0iS`-|)4(@;g73@y(LAkD<
zRyA^ed`}(5T0`bm={dPHhy*28sG_s~A!dxPf_g*z_k}N(7yNKsugf7&dI4%X$}Q(*
zoOyVFiW~K-e6!59uE+V{F9Npy{><w%lp@c32h!e<%^yWTl?>`}|6Nx0|G7B)-5{0F
zzgDIl^f!2pHkeomSagk^WVJ2}9MP0{VSKKoCI7~#n+Y5zDGGzvpfC#Ts+8UN>=<zV
z=y0)HRVvz0sM31&qON<im3cT@jq^=U%PQDJAf+0yc}+(j<VoF$kCDE1<;<hjB@V3`
z*xk}1jM^I1^WRO+M?ZV&?tl3*y;<mv)4q9!LdaKSRHgPOsJ2Mtu@?tPieQAM`sFb2
zAT4;TDY?!kNpUM3&PqHBy`DbZnkx0W)%ACYS$b|!&h#0PYy!V~53#Qt)tDBTVA9M!
zChf3;Yd0TvG1w#-o^iA4KC#%V5MmxZ&v%em?VBSr*+0$>3rVILD>j+%du7Pb%WEAn
zi-m)W>dJ4J%EsCE=)p1S`5IH(jbPgW>DA>4a|(V*;u8Gb{hr6pr&WUwp3rOpo5$6+
z6@K(9og_W5BnK2`*H+hz<Y}m{iszaCSL3)O+6)6#_3PwfpfvlK>@Z(Ff3SR=C$iuL
z2H5C9$+NXFTdN87s)0H<F0*01-cwd73gZV_`&4`18K{nHmhsTDG*b8Tx{=k7|IB4O
z(2bCikzI6`@y!x%raI869Gp6O*mjf$gvm1vztJj~kem&P+g^iPfIcRjmgg3Z7je<k
zAA%x#$>78s&DxqlGRC=o4JhOYA+6mR*xZ7f!wTJlod&|a3-VpR-Rodt2X(w#DPdL`
zUK?A>qNDr8upaZ#jB5F$DU^hOk5AAuFl>!GzuAjxrBq=9VaM~HYJy0%F1H4-&#7hA
zzLj|-pK`dux;F>ZzAM+<#cuhx%sN^A3k)#g&-ZkXsMfw`Uf5gp5>Sh&=mTh~$Gzrk
zZ%=)2Q7JWQAhgVOC0W?L!M<HSlC5O|>^~=<t8qrM$Xjo{$KG{3t?O;@y?m%05qbaq
z{j#&~rhD|XI{ei~bHU}yY&uDY>mKQWiaxcq@g(968Ilqb-J@=}O%nYByc>@F-`L#v
zMMl9i^JNaL|M=@ajqv3<dtV>Xe))eEhV9Z^KISS&D(|m+WYL}2J;^NTbfRxp(lNL2
z@G`c}T|{udubxe}Y|qvi{I{4{Xogj8ZVZ(M4mzr9{v(Y0A6fY&h0?ED`wK!ub_1kD
z9@F*-fl0{zaxQiUqMcr&_^)5T;t)|%pDe5bKF4Wn_o{{}_cRZ|@U2NQ-Fx<6UA@#T
z#aHX6KM5%Bj;+;O1>1>1tplq@O^G$ad<%KAHguN329*C2MCW}83Iv*6tiMN8?b-G1
ze~b`EMA{D6_XH-{4zDNqYVBV`YH99?FN4yq1UdG6{J%@h&Wrxeg(WJ@*3#<KixD7%
zK$_pNQADsesq#3PG?U_z`!&#q1+dMWwoPWD$~e<vsi*h-Z6cbVlW)CJK?U*>s3a@Y
zs-OQ;Nf!3>%-zl8RZZ5u@ismwDe2ktHb?Ctb=TVJ3D^gk?0x#l4m*Rw7t1*`a7Ms8
ze{4sBJ-t}qhhzU7&{v?f{pD~rhhQ4C>!65=G5<@x>i?}h{eOVSI1cMVZ(8KTSsZUd
zcZ7JEwL$bzN5)cX2jB6UFuXxFFj>;ebkS|_I|wQ^Y?0mMNJ}2{2NwD5pYS?Jsd=BD
zjDlL<!X%+A2fA$FtouZOC~D(7kCBsguo=d{!0?EMSFyyU@v{UBWP`mx$V&Vp*y<d`
zs%_8`>a|6|^w4hw_~T%sU~0pq%4ybnY!6Ty?FqI_q@|^01W{h2okD%QC4-jyXm=5O
zeFC6*_=7-6uyYW6W8$DI%o9}l^Dk2KIyn3o9tQ<Fec_X5LnFO4o+sle^M;2TA}oGt
zMMmi)21gl@68IFL<4Kq8pC5Fq28g&E>%2SHhFM;k<z`&o(v#^4`<?mqM!2vQ+)@EX
zbL;U>37kgZw~knIYAX2`wdRYv?MF5`#Ve|GE&cak1cbu3Z+kd^4Hd$!_Rd>E59v4t
zeo9*9kgL7Dtbh|N%jneWcgGRNE4jRF4hTwove(jCnj6(KU4fJ*CO%4ZCtC5g1T%eP
z(_k_Ey>s>6WK>`B&RwR{uJU!*xnLcrrB4(ni~P(2KEgp_aJqrbJZhOBY_?yWD$stu
zJ?)nbL)IH-p7Vp#EUyIhW=CL4a<ni|iB3RXD09X@2w8&Uy^mqGPPrm5F`%m1;ShX#
z<Tjr}WAD$&(Zbom%Pbs6iwg7W3-j|jHE9_jtqi^z!*%=%hd9vv#O_3w66at`{T%BO
zd?1AI)^@r4jNmEl+3`+zG`lWWmS%G=yj;{PS<KTNY;`{%c0X7<m^zULd){FVFM~iJ
z6jdPqX4Y*nE@&46R8=>u4btUinxf9GP0WLxcCn|~7*P2)khVA6IbzPc*aqy3G~?Xu
zbgDH|K3RI}RuwG`*zhjDoSNo^^%7tM*xN_gP}S7DIYST_62hK+HT2N5C+6ERuEc>z
z1^Cnu_+EwCK*2ir9Ex(ba)XARgz9L|X@EBD)@@3PBZ(tn@sA5eVE`Wz63%s+&UY~$
zNMGrg1U<V;BltNuxbtgkvtAQ~4&((t2=AU;*RiwPjlNvh1@)saRaWo`5P_#3t|5)h
z>2I5}+S)x{Z6&0kUo>RMU%uARIRbXZgZ&|(fLh39VXVc>{zEWTr33QIcZr!%J{CsC
zmw3fi7s;pcorQ*q`A~`%(0p61>%_GF{*n&s!@}MTb$xw(AYTSzMJ!%XNBB$5Z{PBH
zua@jT;^1K9U5IPC2R>*3w)~t~B^OrYYsN7+7reH(Bu6FCq~~A%Qf(Q9bL}-?U8XtI
z>`}0FPcF0JEO#P$9azu>i*r?WU<YOt+#WswGyMHaykO?${zc3_Zy;DEP!LZqn@CD!
zcQC$30ZO{@LtIk4$G`_N0)nZW!Ive#o_*l-5{9^_W$9tiJ$vWj>ie_+d=jX$@bGms
z(U3|1Y?Z5|gG0|n(MPZ+sxPKg!7I_8!gskx_XZi6=SE8`AB=r<SE6?R=3gLF3o3Sa
z*+48vUC9f4ucC90oUJ>N*>su6>Z#4FYzQggNrf}ei1q%<<@xznl`3%X=@ej12Kayr
z+6%T!oDtJFt{+l^PnaMkpV-W1RFiyH3aGfi3CUBae+Bm24en}!4J5s*k%nyv{$P^}
zK!REz*0cL7EHJrvdEo|Ij<k^<Dxa=?veM*642InyXtaL&^a@mz1AOorF{{aP>yi2)
zQ*!T}etvK5oM(;Tqdw~Akr<6QsI2$uW=C;2z!##^A{_(8UsKr9&G%@YoMa_|<gL#d
zh=|^qj%3!hTi(P9(scS5<ar|IGutI%vb@)s>+lLnar+zY)j+kxzlBx*C5rT~5!e6Y
bOBb}w9_0f@gzulAUh`5y?pcAj-uwRr+*T%z

diff --git a/qwen3_moe_mast_20steps_loss_curve.svg b/qwen3_moe_mast_20steps_loss_curve.svg
deleted file mode 100644
index 7fc6c0ca..00000000
--- a/qwen3_moe_mast_20steps_loss_curve.svg
+++ /dev/null
@@ -1,68 +0,0 @@
-<svg xmlns="http://www.w3.org/2000/svg" width="920" height="560" viewBox="0 0 920 560">
-<rect width="100%" height="100%" fill="white"/>
-<style>text{font-family:Arial,Helvetica,sans-serif;fill:#1f2937}.tick{font-size:12px}.label{font-size:15px}.title{font-size:22px;font-weight:700}.grid{stroke:#d1d5db;stroke-width:1;stroke-dasharray:4 5}.axis{stroke:#111827;stroke-width:1.5}.line{fill:none;stroke:#1f77b4;stroke-width:3;stroke-linejoin:round;stroke-linecap:round}.point{fill:#1f77b4;stroke:white;stroke-width:2}</style>
-<text class="title" x="460.0" y="34" text-anchor="middle">Qwen3 MoE 30B-A3B MAST Training Loss</text>
-<line class="grid" x1="88" y1="482.00" x2="884" y2="482.00"/>
-<text class="tick" x="76" y="486.00" text-anchor="end">9.5</text>
-<line class="grid" x1="88" y1="412.00" x2="884" y2="412.00"/>
-<text class="tick" x="76" y="416.00" text-anchor="end">10.0</text>
-<line class="grid" x1="88" y1="342.00" x2="884" y2="342.00"/>
-<text class="tick" x="76" y="346.00" text-anchor="end">10.5</text>
-<line class="grid" x1="88" y1="272.00" x2="884" y2="272.00"/>
-<text class="tick" x="76" y="276.00" text-anchor="end">11.0</text>
-<line class="grid" x1="88" y1="202.00" x2="884" y2="202.00"/>
-<text class="tick" x="76" y="206.00" text-anchor="end">11.5</text>
-<line class="grid" x1="88" y1="132.00" x2="884" y2="132.00"/>
-<text class="tick" x="76" y="136.00" text-anchor="end">12.0</text>
-<line class="grid" x1="88" y1="62.00" x2="884" y2="62.00"/>
-<text class="tick" x="76" y="66.00" text-anchor="end">12.5</text>
-<line class="grid" x1="88.00" y1="62" x2="88.00" y2="482"/>
-<text class="tick" x="88.00" y="506" text-anchor="middle">1</text>
-<line class="grid" x1="129.89" y1="62" x2="129.89" y2="482"/>
-<text class="tick" x="129.89" y="506" text-anchor="middle">2</text>
-<line class="grid" x1="213.68" y1="62" x2="213.68" y2="482"/>
-<text class="tick" x="213.68" y="506" text-anchor="middle">4</text>
-<line class="grid" x1="297.47" y1="62" x2="297.47" y2="482"/>
-<text class="tick" x="297.47" y="506" text-anchor="middle">6</text>
-<line class="grid" x1="381.26" y1="62" x2="381.26" y2="482"/>
-<text class="tick" x="381.26" y="506" text-anchor="middle">8</text>
-<line class="grid" x1="465.05" y1="62" x2="465.05" y2="482"/>
-<text class="tick" x="465.05" y="506" text-anchor="middle">10</text>
-<line class="grid" x1="548.84" y1="62" x2="548.84" y2="482"/>
-<text class="tick" x="548.84" y="506" text-anchor="middle">12</text>
-<line class="grid" x1="632.63" y1="62" x2="632.63" y2="482"/>
-<text class="tick" x="632.63" y="506" text-anchor="middle">14</text>
-<line class="grid" x1="716.42" y1="62" x2="716.42" y2="482"/>
-<text class="tick" x="716.42" y="506" text-anchor="middle">16</text>
-<line class="grid" x1="800.21" y1="62" x2="800.21" y2="482"/>
-<text class="tick" x="800.21" y="506" text-anchor="middle">18</text>
-<line class="grid" x1="884.00" y1="62" x2="884.00" y2="482"/>
-<text class="tick" x="884.00" y="506" text-anchor="middle">20</text>
-<line class="axis" x1="88" y1="482" x2="884" y2="482"/>
-<line class="axis" x1="88" y1="62" x2="88" y2="482"/>
-<text class="label" x="460.0" y="536" text-anchor="middle">Training step</text>
-<text class="label" transform="translate(24 280.0) rotate(-90)" text-anchor="middle">Loss</text>
-<polyline class="line" points="88.00,79.02 129.89,81.14 171.79,85.61 213.68,92.24 255.58,101.13 297.47,112.38 339.37,125.14 381.26,140.13 423.16,157.33 465.05,176.44 506.95,197.38 548.84,219.78 590.74,244.26 632.63,268.22 674.53,297.78 716.42,325.93 758.32,358.37 800.21,389.95 842.11,422.79 884.00,459.42"/>
-<circle class="point" cx="88.00" cy="79.02" r="4.2"><title>step 1: 12.37845</title></circle>
-<circle class="point" cx="129.89" cy="81.14" r="4.2"><title>step 2: 12.36325</title></circle>
-<circle class="point" cx="171.79" cy="85.61" r="4.2"><title>step 3: 12.33137</title></circle>
-<circle class="point" cx="213.68" cy="92.24" r="4.2"><title>step 4: 12.28397</title></circle>
-<circle class="point" cx="255.58" cy="101.13" r="4.2"><title>step 5: 12.22048</title></circle>
-<circle class="point" cx="297.47" cy="112.38" r="4.2"><title>step 6: 12.14017</title></circle>
-<circle class="point" cx="339.37" cy="125.14" r="4.2"><title>step 7: 12.04897</title></circle>
-<circle class="point" cx="381.26" cy="140.13" r="4.2"><title>step 8: 11.94193</title></circle>
-<circle class="point" cx="423.16" cy="157.33" r="4.2"><title>step 9: 11.81908</title></circle>
-<circle class="point" cx="465.05" cy="176.44" r="4.2"><title>step 10: 11.68259</title></circle>
-<circle class="point" cx="506.95" cy="197.38" r="4.2"><title>step 11: 11.53297</title></circle>
-<circle class="point" cx="548.84" cy="219.78" r="4.2"><title>step 12: 11.37303</title></circle>
-<circle class="point" cx="590.74" cy="244.26" r="4.2"><title>step 13: 11.19815</title></circle>
-<circle class="point" cx="632.63" cy="268.22" r="4.2"><title>step 14: 11.02700</title></circle>
-<circle class="point" cx="674.53" cy="297.78" r="4.2"><title>step 15: 10.81583</title></circle>
-<circle class="point" cx="716.42" cy="325.93" r="4.2"><title>step 16: 10.61479</title></circle>
-<circle class="point" cx="758.32" cy="358.37" r="4.2"><title>step 17: 10.38304</title></circle>
-<circle class="point" cx="800.21" cy="389.95" r="4.2"><title>step 18: 10.15753</title></circle>
-<circle class="point" cx="842.11" cy="422.79" r="4.2"><title>step 19: 9.92291</title></circle>
-<circle class="point" cx="884.00" cy="459.42" r="4.2"><title>step 20: 9.66127</title></circle>
-<text class="tick" x="98.00" y="69.02" text-anchor="start">12.37845</text>
-<text class="tick" x="874.00" y="449.42" text-anchor="end">9.66127</text>
-</svg>
\ No newline at end of file
diff --git a/qwen3_moe_mast_20steps_losses.csv b/qwen3_moe_mast_20steps_losses.csv
deleted file mode 100644
index cf58cdd8..00000000
--- a/qwen3_moe_mast_20steps_losses.csv
+++ /dev/null
@@ -1,21 +0,0 @@
-step,loss
-1,12.37845
-2,12.36325
-3,12.33137
-4,12.28397
-5,12.22048
-6,12.14017
-7,12.04897
-8,11.94193
-9,11.81908
-10,11.68259
-11,11.53297
-12,11.37303
-13,11.19815
-14,11.02700
-15,10.81583
-16,10.61479
-17,10.38304
-18,10.15753
-19,9.92291
-20,9.66127

From a7ea958fab5b01772bbe6f098ce50ddf4dc856a2 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Fri, 5 Jun 2026 20:35:55 -0700
Subject: [PATCH 23/27] Add LP-relaxation solver, optimality-check gap logging,
 step-time profiling

Adds solver="lp" (use the empirically-integral LP relaxation directly,
skipping branch-and-bound) and an optimality_check option that solves the
LP lower bound and logs the certified gap of the achieved objective. The
sanity script now reports steady-state per-step latency. Tests cover the
LP solver matching the ILP optimum and the gap logging.

Authored with Claude.
---
 autoparallel/api.py                  | 119 ++++++++++++++++++++++-----
 autoparallel/approximate_sharding.py | 109 ++++++++++++++++--------
 examples/_sanity_llama3.py           |  20 +++--
 tests/test_approximate_sharding.py   |  39 +++++++++
 4 files changed, 225 insertions(+), 62 deletions(-)

diff --git a/autoparallel/api.py b/autoparallel/api.py
index f602a967..ef664dc4 100644
--- a/autoparallel/api.py
+++ b/autoparallel/api.py
@@ -194,6 +194,11 @@ class AutoParallel:
         The meta model is moved to a fake device based on mesh.device_type.
     """
 
+    # Selectable solvers. "ilp": exact PuLP/CBC. "approx": heuristic TRW-S
+    # (light build, no PuLP). "lp": LP relaxation used directly as the solve
+    # (empirically integral for this problem, so much cheaper than CBC).
+    SOLVER_CHOICES = ("ilp", "approx", "lp")
+
     def __init__(
         self,
         model,
@@ -207,11 +212,16 @@ def __init__(
         solver: str = "ilp",
     ):
         self.stack = ExitStack()
+        # The solver chosen here decides how the optimizer is built: "ilp"/"lp"
+        # build the full PuLP problem (CBC exact solve / LP relaxation solve);
         # "approx" builds a lighter optimizer (no PuLP variables/constraints),
-        # which is much faster to construct; optimize_placement(solver="approx")
-        # then solves it heuristically. "ilp" builds the full PuLP problem.
-        if solver not in ("ilp", "approx"):
-            raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'")
+        # much faster to construct, solved heuristically. optimize_placement(
+        # solver=...) may override the solve as long as it is compatible with
+        # this build.
+        if solver not in self.SOLVER_CHOICES:
+            raise ValueError(
+                f"Unknown solver={solver!r}; expected one of {self.SOLVER_CHOICES}"
+            )
         self.solver = solver
         self.fake_mode = (
             FakeTensorMode()
@@ -289,7 +299,7 @@ def __enter__(self):
                 self.mesh,
                 force_grad_reduce_in_higher_precision,
                 repeated_subgraphs=self.repeated_subgraphs,
-                build_pulp=self.solver != "approx",
+                build_pulp=self.solver in ("ilp", "lp"),
             )
 
             self.sharding_optimizer = sharding_optimizer
@@ -526,39 +536,73 @@ def propagate_annotations(self, verbose=True, aggressive=False, method="fix"):
             )
         return self.propagation_result
 
-    def optimize_placement(self, verbose=True, solver=None, approximate_options=None):
+    def optimize_placement(
+        self,
+        verbose=True,
+        solver=None,
+        approximate_options=None,
+        optimality_check=False,
+    ):
         """Solve for the optimal placement.
 
-        solver="ilp" uses the exact PuLP/CBC solver. solver="approx" uses the
-        heuristic ApproximateShardingSolver, which trades a small objective gap
-        for a much faster solve. approximate_options is forwarded as kwargs to
-        the approximate solver (e.g. candidate_limit, max_sweeps). Defaults to the
-        solver chosen at AutoParallel construction; note an optimizer built with
-        solver="approx" has no PuLP problem and cannot run the ILP.
+        solver selects how the placement is solved (defaults to the solver chosen
+        at AutoParallel construction):
+          - "ilp":    exact PuLP/CBC solve.
+          - "approx": heuristic TRW-S ApproximateShardingSolver — trades a small
+            objective gap for a much faster solve.
+          - "lp":     solve the LP relaxation and use it directly. This problem is
+            empirically integral, so the relaxation optimum equals the ILP optimum
+            while skipping branch-and-bound; raises if it comes out fractional.
+        approximate_options is forwarded as kwargs to the approximate solver
+        (e.g. candidate_limit, max_sweeps). The requested solver must be
+        compatible with how the optimizer was built: "ilp"/"lp" need a PuLP
+        problem (build with solver="ilp" or "lp").
+
+        optimality_check: after solving, solve the LP relaxation as a lower bound
+        and log the certified gap of the achieved objective from the optimum.
+        Requires a PuLP problem (i.e. an "ilp"/"lp" build).
         """
         self._assert_entered()
         if solver is None:
             solver = self.solver
 
+        opt = self.sharding_optimizer
         if solver in ("approx", "approximate"):
             from .approximate_sharding import ApproximateShardingSolver
 
-            approx = ApproximateShardingSolver(
-                self.sharding_optimizer, **(approximate_options or {})
-            )
+            approx = ApproximateShardingSolver(opt, **(approximate_options or {}))
             self.sharding_placement = approx.get_solution(verbose=verbose)
         elif solver == "ilp":
-            if self.sharding_optimizer.prob is None:
+            if opt.prob is None:
                 raise RuntimeError(
                     "solver='ilp' requires a PuLP problem, but this AutoParallel "
-                    "was constructed with solver='approx' (no PuLP built). "
+                    "was constructed without one (e.g. solver='approx'). "
                     "Construct with solver='ilp' to use the exact solver."
                 )
-            self.sharding_placement = self.sharding_optimizer.get_solution(
-                verbose=False
-            )
+            self.sharding_placement = opt.get_solution(verbose=False)
+        elif solver in ("lp", "lp_relax", "lp_relaxation"):
+            if opt.prob is None:
+                raise RuntimeError(
+                    "solver='lp' requires a PuLP problem, but this AutoParallel "
+                    "was constructed without one (e.g. solver='approx'). "
+                    "Construct with solver='lp' or 'ilp' to use the LP solver."
+                )
+            opt._set_objective()
+            res = opt.solve_lp_relaxation(verbose=verbose, extract=True)
+            if res["solution"] is None:
+                raise RuntimeError(
+                    "solver='lp' requires an integral LP relaxation, but it came "
+                    f"out fractional ({res['n_fractional']}/{res['n_vars']} "
+                    "variables). Use solver='ilp' for an exact integral solve."
+                )
+            self.sharding_placement = res["solution"]
         else:
-            raise ValueError(f"Unknown solver={solver!r}; expected 'ilp' or 'approx'")
+            raise ValueError(
+                f"Unknown solver={solver!r}; expected one of {self.SOLVER_CHOICES}"
+            )
+
+        if optimality_check:
+            self._log_optimality_check(solver, verbose=verbose)
 
         if verbose:
             logger.info(self.sharding_optimizer.get_log(verbose=True))
@@ -589,6 +633,39 @@ def optimize_placement(self, verbose=True, solver=None, approximate_options=None
 
         return self.sharding_placement
 
+    def _log_optimality_check(self, solver, verbose=False):
+        """Solve the LP relaxation as a lower bound and log the certified gap of
+        the achieved objective from the optimum. Needs a PuLP problem."""
+        import pulp
+
+        opt = self.sharding_optimizer
+        if opt.prob is None:
+            logger.warning(
+                "optimality_check skipped: solver=%r build has no PuLP problem; "
+                "construct with solver='ilp' or 'lp' to enable it.",
+                self.solver,
+            )
+            return
+        achieved = opt._safe_float(pulp.value(opt.prob.objective))
+        lb_res = opt.get_lower_bound(verbose=verbose)
+        lb = lb_res.objective
+        if not lb or lb <= 0 or achieved is None:
+            logger.warning(
+                "optimality_check inconclusive: lower_bound=%s achieved=%s",
+                lb,
+                achieved,
+            )
+            return
+        gap = (achieved - lb) / lb
+        logger.info(
+            "optimality check (solver=%s): objective=%.4f LP lower bound=%.4f "
+            "=> within %.2f%% of optimum (certified)",
+            solver,
+            achieved,
+            lb,
+            gap * 100,
+        )
+
     def _apply_placement_common(self, sharding_placement):
         t0 = time.perf_counter()
         self._assert_entered()
diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
index 5361a8a0..ec44d7df 100644
--- a/autoparallel/approximate_sharding.py
+++ b/autoparallel/approximate_sharding.py
@@ -182,8 +182,11 @@ def _solve(self, verbose: bool = False):
             logger.info(
                 "approx build: problem=%.2fs %s factors=%.2fs groups=%d "
                 "cost_bearing=%d edges=%d max_domain=%d",
-                t_bp - t0, getattr(self, "_build_times", {}), t_bf - t_bp,
-                len(self.groups), len(self.cost_bearing),
+                t_bp - t0,
+                getattr(self, "_build_times", {}),
+                t_bf - t_bp,
+                len(self.groups),
+                len(self.cost_bearing),
                 sum(len(v) for v in self.input_edges.values()),
                 max((g.domain for g in self.groups), default=0),
             )
@@ -196,12 +199,13 @@ def _solve(self, verbose: bool = False):
         t_bp0 = time.perf_counter()
         self._belief_propagation(deadline)
         if verbose:
-            logger.info("approx phase: trws iter=%s delta=%.4g in %.2fs; "
-                        "decode energy=%.1f",
-                        getattr(self, "_bp_last_iter", None),
-                        getattr(self, "_bp_last_delta", float("nan")),
-                        time.perf_counter() - t_bp0,
-                        self._fast_total_energy())
+            logger.info(
+                "approx phase: trws iter=%s delta=%.4g in %.2fs; " "decode energy=%.1f",
+                getattr(self, "_bp_last_iter", None),
+                getattr(self, "_bp_last_delta", float("nan")),
+                time.perf_counter() - t_bp0,
+                self._fast_total_energy(),
+            )
         self._memory_repair()
         self._coordinate_descent(deadline)
         if verbose:
@@ -314,7 +318,9 @@ def _build_problem(self):
         for v in self.cost_bearing:
             node = opt.nodes[v]
             self.allowed_out[v] = [
-                o for o in self.allowed_out[v] if not self._out_fully_forbidden(v, node, o)
+                o
+                for o in self.allowed_out[v]
+                if not self._out_fully_forbidden(v, node, o)
             ]
         t_forbid = time.perf_counter()
 
@@ -375,9 +381,9 @@ def _parse_constraints(self):
                     if pos_key is not None and neg_key is not None:
                         break
                 if pos_key is not None and neg_key is not None:
-                    authoritative.setdefault(
-                        (neg_key[0], neg_key[1]), set()
-                    ).add(pos_key[0])
+                    authoritative.setdefault((neg_key[0], neg_key[1]), set()).add(
+                        pos_key[0]
+                    )
                 continue
             if name.startswith(self._SKIP_PREFIXES):
                 continue
@@ -410,8 +416,11 @@ def _parse_constraints(self):
                 oa, ob = {k[2] for k in neg}, {k[2] for k in pos}
                 if len(na) == 1 and len(nb) == 1 and len(oa) == 1 and len(ob) == 1:
                     paired_edges.append(
-                        (next(iter(na)), next(iter(nb)),
-                         frozenset({(next(iter(oa)), next(iter(ob)))}))
+                        (
+                            next(iter(na)),
+                            next(iter(nb)),
+                            frozenset({(next(iter(oa)), next(iter(ob)))}),
+                        )
                     )
         # method="fix" axis pins leave no PuLP row to parse above, so replay the
         # log to recover them (constraint-method pins are also picked up here,
@@ -775,8 +784,11 @@ def candidates(m, assign):
                 if nb in assign and nb in member_set:
                     allowed = allow[(nb, m)].get(assign[nb], set())
                     cand = allowed if cand is None else (cand & allowed)
-            cand = set(self.allowed_out.get(m, [])) if cand is None else (
-                cand & set(self.allowed_out.get(m, [])))
+            cand = (
+                set(self.allowed_out.get(m, []))
+                if cand is None
+                else (cand & set(self.allowed_out.get(m, [])))
+            )
             return cand
 
         def dfs(i, assign):
@@ -797,7 +809,8 @@ def dfs(i, assign):
         if len(results) >= limit:
             logger.warning(
                 "Approximate solver: group of %d nodes hit group_domain_limit=%d.",
-                len(members), limit,
+                len(members),
+                limit,
             )
         return results
 
@@ -873,8 +886,9 @@ def _build_memory_info(self):
             for v in param_idxs:
                 r = ratios[v]
                 mn = min(r.values())
-                self.allowed_out[v] = [o for o in self.allowed_out[v]
-                                       if r[o] <= mn + 1e-12]
+                self.allowed_out[v] = [
+                    o for o in self.allowed_out[v] if r[o] <= mn + 1e-12
+                ]
         self._memory = {
             "param_idxs": param_idxs,
             "ratios": ratios,
@@ -1140,8 +1154,11 @@ def _coordinate_descent(self, deadline):
 
     def _star_block_search(self, deadline):
         ranked = sorted(
-            ((len(self.nbrs[g]), g) for g in range(len(self.groups))
-             if len(self.nbrs[g]) >= 2 and self.groups[g].domain > 1),
+            (
+                (len(self.nbrs[g]), g)
+                for g in range(len(self.groups))
+                if len(self.nbrs[g]) >= 2 and self.groups[g].domain > 1
+            ),
             reverse=True,
         )
         for _ in range(self.star_passes):
@@ -1213,34 +1230,48 @@ def _block_energy(self, gids):
     def _current_memory(self):
         if self._memory is None:
             return 0.0
-        return sum(self._memory["ratios"][v][self.cur_out[v]]
-                   for v in self._memory["param_idxs"])
+        return sum(
+            self._memory["ratios"][v][self.cur_out[v]]
+            for v in self._memory["param_idxs"]
+        )
 
     def _memory_ok_after(self, gid, ci):
         if self._memory is None or self._memory.get("tight"):
             return True
         ratios = self._memory["ratios"]
         choice = self.groups[gid].choices[ci]
-        delta = sum(ratios[m][o] - ratios[m][self.cur_out[m]]
-                    for m, o in choice.items() if m in ratios)
+        delta = sum(
+            ratios[m][o] - ratios[m][self.cur_out[m]]
+            for m, o in choice.items()
+            if m in ratios
+        )
         mem = self._current_memory() + delta
-        return (self._memory["budget_low"] - 1e-6 <= mem
-                <= self._memory["budget_high"] + 1e-6)
+        return (
+            self._memory["budget_low"] - 1e-6
+            <= mem
+            <= self._memory["budget_high"] + 1e-6
+        )
 
     def _block_memory_ok(self):
         if self._memory is None or self._memory.get("tight"):
             return True
         mem = self._current_memory()
-        return (self._memory["budget_low"] - 1e-6 <= mem
-                <= self._memory["budget_high"] + 1e-6)
+        return (
+            self._memory["budget_low"] - 1e-6
+            <= mem
+            <= self._memory["budget_high"] + 1e-6
+        )
 
     def _memory_repair(self):
         if self._memory is None or self._memory.get("tight"):
             return
         low, high = self._memory["budget_low"], self._memory["budget_high"]
         ratios = self._memory["ratios"]
-        param_groups = {self.node_to_group[v] for v in self._memory["param_idxs"]
-                        if v in self.node_to_group}
+        param_groups = {
+            self.node_to_group[v]
+            for v in self._memory["param_idxs"]
+            if v in self.node_to_group
+        }
         for _ in range(2 * max(1, len(param_groups))):
             mem = self._current_memory()
             if low - 1e-6 <= mem <= high + 1e-6:
@@ -1254,8 +1285,11 @@ def _memory_repair(self):
                     if ci == group.current:
                         continue
                     choice = group.choices[ci]
-                    dmem = sum(ratios[m][choice[m]] - ratios[m][self.cur_out[m]]
-                               for m in choice if m in ratios)
+                    dmem = sum(
+                        ratios[m][choice[m]] - ratios[m][self.cur_out[m]]
+                        for m in choice
+                        if m in ratios
+                    )
                     if (dmem < -1e-9) != over and abs(dmem) > 1e-9:
                         continue
                     if abs(dmem) <= 1e-9:
@@ -1264,8 +1298,13 @@ def _memory_repair(self):
                     if best is None or score < best[0]:
                         best = (score, gid, ci)
             if best is None:
-                logger.warning("Approximate solver: memory repair stuck at %.4f "
-                               "(budget=[%.4f,%.4f]).", mem, low, high)
+                logger.warning(
+                    "Approximate solver: memory repair stuck at %.4f "
+                    "(budget=[%.4f,%.4f]).",
+                    mem,
+                    low,
+                    high,
+                )
                 return
             self._set_group(best[1], best[2])
 
diff --git a/examples/_sanity_llama3.py b/examples/_sanity_llama3.py
index 71fc1122..6a44b386 100644
--- a/examples/_sanity_llama3.py
+++ b/examples/_sanity_llama3.py
@@ -179,28 +179,36 @@ def input_fn():
 
     try:
         losses = []
+        step_times = []
         for step in range(args.train_steps):
+            torch.cuda.synchronize(device)
+            t_step = time.perf_counter()
             optimizer.zero_grad(set_to_none=True)
             step_loss = torch.zeros((), device=device)
             for mi, ml in zip(input_mbs, label_mbs):
                 logits = parallel_mod(mi)
-                if torch.any(torch.isnan(logits)):
-                    raise RuntimeError("NaN in forward output")
                 loss = vocab_parallel_cross_entropy(
                     logits, ml, vocab_size=model_args.vocab_size, tp_group=tp_group,
                     tp_rank=tp_rank, tp_degree=tp_degree, normalizer=normalizer)
-                if torch.any(torch.isnan(loss)):
-                    raise RuntimeError("NaN in loss")
                 loss.backward()
                 step_loss = step_loss + loss.detach()
             torch.nn.utils.clip_grad_norm_(parallel_mod.parameters(), args.max_grad_norm)
             optimizer.step()
+            torch.cuda.synchronize(device)
+            step_times.append(time.perf_counter() - t_step)
             with torch.no_grad():
                 logged = step_loss.clone()
                 dist.all_reduce(logged, op=dist.ReduceOp.SUM)
             losses.append(float(logged.item()))
-            print_rank0(f"step={step:03d} loss={losses[-1]:.6f}")
-
+            print_rank0(f"step={step:03d} loss={losses[-1]:.6f} step_time={1000*step_times[-1]:.0f}ms")
+
+        warmup = min(3, max(0, len(step_times) - 2))
+        steady = sorted(step_times[warmup:])
+        if steady:
+            mean_ms = 1000 * sum(steady) / len(steady)
+            print_rank0(f"[latency] solver={args.solver} per-step (excl {warmup} warmup, "
+                        f"{len(steady)} steps): mean={mean_ms:.0f}ms "
+                        f"median={1000*steady[len(steady)//2]:.0f}ms min={1000*steady[0]:.0f}ms")
         print_rank0(f"\nloss curve: {[round(x, 4) for x in losses]}")
         verdict = "PASS" if losses[-1] < losses[0] else "FAIL"
         print_rank0(f"SANITY {verdict}: loss {losses[0]:.4f} -> {losses[-1]:.4f}")
diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py
index 0383fad8..05d05314 100644
--- a/tests/test_approximate_sharding.py
+++ b/tests/test_approximate_sharding.py
@@ -3,6 +3,7 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
 import math
 
 import pulp
@@ -89,6 +90,44 @@ def test_approx_objective_close_to_ilp():
         )
 
 
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+@pytest.mark.filterwarnings("ignore:Overwriting previously set objective")
+def test_lp_solver_matches_ilp():
+    """The LP-relaxation solver returns an integral, ILP-feasible assignment whose
+    objective equals the exact ILP optimum (the relaxation is integral here)."""
+    mesh = _fake_2d_mesh()
+    with _tiny_llama3_autop(mesh) as autop:
+        _add_constraints(autop, mesh)
+        opt = autop.sharding_optimizer
+
+        autop.optimize_placement(verbose=False, solver="lp")
+        lp_objective = pulp.value(opt.prob.objective)
+        violated = [n for n, c in opt.prob.constraints.items() if not c.valid()]
+        assert not violated, f"lp violated {len(violated)} constraints"
+
+        autop.optimize_placement(verbose=False, solver="ilp")
+        ilp_objective = pulp.value(opt.prob.objective)
+
+        assert math.isfinite(lp_objective)
+        assert lp_objective == pytest.approx(ilp_objective, rel=1e-6)
+
+
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+@pytest.mark.filterwarnings("ignore:Overwriting previously set objective")
+def test_optimality_check_logs_certified_gap(caplog):
+    """optimality_check=True solves the LP lower bound and logs the certified gap."""
+    mesh = _fake_2d_mesh()
+    with _tiny_llama3_autop(mesh) as autop:
+        _add_constraints(autop, mesh)
+        with caplog.at_level(logging.INFO, logger="autoparallel.api"):
+            autop.optimize_placement(
+                verbose=False, solver="approx", optimality_check=True
+            )
+        assert any("optimality check" in r.message for r in caplog.records)
+
+
 @apply_cuda_patches
 @pytest.mark.filterwarnings("ignore:Constructing LpVariable")
 def test_approx_objective_is_faithful():

From 1435b7bcde1fc5fad3088871a63b0b796046ac02 Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 7 Jun 2026 11:00:11 -0700
Subject: [PATCH 24/27] Approx solver: memory-constrained solve via Lagrangian
 relaxation

The parameter-memory budget Sum_param ratio*x in [low,high] is a single
node-separable linear coupling, so penalizing it by lambda folds lambda*ratio
into the param unaries and leaves the pairwise MRF untouched. A scalar
bisection on lambda drives the achieved memory into the budget, and a
budget-constrained coordinate/star polish recovers integer solutions inside
the (memory, cost) hull that no single lambda reaches. Routed in only for
non-tight budgets; the tight default still uses build-time param pinning.

Authored with Claude.
---
 autoparallel/approximate_sharding.py | 214 ++++++++++++++++++++++--
 examples/_bench_mem_lagrangian.py    | 237 +++++++++++++++++++++++++++
 tests/test_approximate_sharding.py   |  35 ++++
 3 files changed, 470 insertions(+), 16 deletions(-)
 create mode 100644 examples/_bench_mem_lagrangian.py

diff --git a/autoparallel/approximate_sharding.py b/autoparallel/approximate_sharding.py
index ec44d7df..3e102da5 100644
--- a/autoparallel/approximate_sharding.py
+++ b/autoparallel/approximate_sharding.py
@@ -151,6 +151,11 @@ def __init__(
         self.consumers: dict[int, list[tuple[int, int]]] = defaultdict(list)
         self.cur_out: dict[int, int] = {}
         self._memory: Optional[dict[str, Any]] = None
+        # When False, the hard memory-budget checks in local search are skipped
+        # (used by the Lagrangian solve, which enforces the budget softly via a
+        # penalty folded into the unaries instead).
+        self._mem_enforce: bool = True
+        self._mem_unary: list[np.ndarray] = []
 
         # Populated by _build_factors().
         self.g_unary: list[np.ndarray] = []
@@ -197,23 +202,42 @@ def _solve(self, verbose: bool = False):
         # to be compared against is strictly dominated and has been dropped; the
         # polish remains for the memory budget and as a local-search safety net.
         t_bp0 = time.perf_counter()
-        self._belief_propagation(deadline)
-        if verbose:
-            logger.info(
-                "approx phase: trws iter=%s delta=%.4g in %.2fs; " "decode energy=%.1f",
-                getattr(self, "_bp_last_iter", None),
-                getattr(self, "_bp_last_delta", float("nan")),
-                time.perf_counter() - t_bp0,
-                self._fast_total_energy(),
+        mem = self._memory
+        if mem is not None and not mem.get("tight"):
+            # A non-tight budget can bind the runtime-optimal placement; solve it
+            # exactly via Lagrangian relaxation (folds λ·ratio into the unaries).
+            # A tight budget is already handled by build-time param pinning, and
+            # the no-memory case has nothing to relax, so both take the plain path.
+            res = self.solve_lagrangian(
+                mem["budget_low"],
+                mem["budget_high"],
+                deadline=deadline,
+                verbose=verbose,
             )
-        self._memory_repair()
-        self._coordinate_descent(deadline)
-        if verbose:
-            logger.info("approx phase: trws+cd energy=%.1f", self._fast_total_energy())
-        self._star_block_search(deadline)
+            if verbose:
+                logger.info(
+                    "approx phase: lagrangian lam=%.4g memory=%.4f feasible=%s",
+                    res["lam"],
+                    res["memory"],
+                    res["feasible"],
+                )
+        else:
+            self._belief_propagation(deadline)
+            if verbose:
+                logger.info(
+                    "approx phase: trws iter=%s delta=%.4g in %.2fs; "
+                    "decode energy=%.1f",
+                    getattr(self, "_bp_last_iter", None),
+                    getattr(self, "_bp_last_delta", float("nan")),
+                    time.perf_counter() - t_bp0,
+                    self._fast_total_energy(),
+                )
+            self._memory_repair()
+            self._coordinate_descent(deadline)
+            self._star_block_search(deadline)
         bp_energy = self._fast_total_energy()
         if verbose:
-            logger.info("approx phase: trws+cd+star energy=%.1f", bp_energy)
+            logger.info("approx phase: polished energy=%.1f", bp_energy)
         t_solve = time.perf_counter() - t0 - t_build
 
         objective = self._write_back()
@@ -1236,7 +1260,7 @@ def _current_memory(self):
         )
 
     def _memory_ok_after(self, gid, ci):
-        if self._memory is None or self._memory.get("tight"):
+        if self._memory is None or self._memory.get("tight") or not self._mem_enforce:
             return True
         ratios = self._memory["ratios"]
         choice = self.groups[gid].choices[ci]
@@ -1253,7 +1277,7 @@ def _memory_ok_after(self, gid, ci):
         )
 
     def _block_memory_ok(self):
-        if self._memory is None or self._memory.get("tight"):
+        if self._memory is None or self._memory.get("tight") or not self._mem_enforce:
             return True
         mem = self._current_memory()
         return (
@@ -1308,6 +1332,164 @@ def _memory_repair(self):
                 return
             self._set_group(best[1], best[2])
 
+    # ------------------------------------------------------------------ #
+    # Lagrangian memory-constrained solve
+    # ------------------------------------------------------------------ #
+    def _build_mem_unary(self):
+        """Per-group vector mem_unary[gid][ci] = Σ_{param member} ratio[member][ci],
+        i.e. the memory term as a node-separable unary so it folds into the
+        Lagrangian objective with no change to the pairwise structure."""
+        self._mem_unary = [np.zeros(g.domain) for g in self.groups]
+        if self._memory is None:
+            return
+        ratios = self._memory["ratios"]
+        for v in self._memory["param_idxs"]:
+            gid = self.node_to_group.get(v)
+            if gid is None:
+                continue
+            r = ratios[v]
+            self._mem_unary[gid] += np.array(
+                [r[c[v]] for c in self.groups[gid].choices]
+            )
+
+    def _run_search(self, deadline):
+        self._belief_propagation(deadline)
+        self._coordinate_descent(deadline)
+        self._star_block_search(deadline)
+
+    def solve_lagrangian(
+        self,
+        budget_low,
+        budget_high,
+        deadline=None,
+        max_iter=30,
+        lam_tol=1e-9,
+        verbose=False,
+    ):
+        """Memory-constrained solve via Lagrangian relaxation.
+
+        The budget Σ_param ratio[v][x_v] ∈ [low, high] is a single linear,
+        node-separable coupling. Penalizing it by λ folds λ·ratio into each param
+        node's unary and leaves the pairwise MRF untouched, so TRW-S + polish
+        solves the penalized problem directly. a(λ) := Σ ratio at the optimum is
+        monotone non-increasing in λ (larger λ ⇒ more sharding ⇒ less memory), so
+        a scalar bisection on λ ≥ 0 drives a(λ) into the budget. The cheapest
+        feasible assignment seen is kept; the existing greedy repair only closes
+        any small residual from integrality.
+
+        Leaves the solver at the chosen assignment (does not write back) and
+        returns a dict: objective (true), memory (achieved a), lam, feasible,
+        iters."""
+        if not self._mem_unary:
+            self._build_mem_unary()
+        t_start = time.perf_counter()
+        if deadline is None:
+            deadline = t_start + self.max_time_s
+        # Reserve the tail of the budget for the constrained polish below.
+        bisect_deadline = t_start + 0.6 * (deadline - t_start)
+        base = [u.copy() for u in self.g_unary]
+        prev_enforce = self._mem_enforce
+        self._mem_enforce = False
+        eps = 1e-6
+
+        best = {"objective": INF, "snapshot": None, "memory": None, "lam": None}
+        # Closest over-budget assignment (smallest excess memory) — the seed the
+        # repair step nudges down into the budget to recover integer solutions
+        # that lie inside the (memory, cost) hull and so no lambda can reach.
+        seed = {"memory": INF, "snapshot": None}
+
+        def evaluate(lam):
+            for gid in range(len(self.groups)):
+                self.g_unary[gid] = base[gid] + lam * self._mem_unary[gid]
+            self._run_search(bisect_deadline)
+            a = self._current_memory()
+            obj = self.total_objective()
+            feasible = budget_low - eps <= a <= budget_high + eps
+            if feasible and obj < best["objective"]:
+                best.update(
+                    objective=obj,
+                    snapshot=[g.current for g in self.groups],
+                    memory=a,
+                    lam=lam,
+                )
+            if budget_high + eps < a < seed["memory"]:
+                seed.update(memory=a, snapshot=[g.current for g in self.groups])
+            if verbose:
+                logger.info(
+                    "lagrangian: lam=%.6g memory=%.5f obj=%.2f feasible=%s",
+                    lam,
+                    a,
+                    obj,
+                    feasible,
+                )
+            return a
+
+        a0 = evaluate(0.0)
+        iters = 1
+        if a0 <= budget_high + eps:
+            lam = 0.0  # unconstrained optimum already fits the budget
+        else:
+            lo_lam, hi_lam = 0.0, 1.0
+            while evaluate(hi_lam) > budget_high + eps and iters < max_iter:
+                lo_lam, hi_lam = hi_lam, hi_lam * 2.0
+                iters += 1
+            while iters < max_iter and hi_lam - lo_lam > lam_tol:
+                mid = 0.5 * (lo_lam + hi_lam)
+                a = evaluate(mid)
+                iters += 1
+                if a > budget_high + eps:
+                    lo_lam = mid  # still over budget, penalize harder
+                else:
+                    hi_lam = mid  # feasible, try to relax toward the cheaper side
+            lam = hi_lam
+
+        for gid in range(len(self.groups)):
+            self.g_unary[gid] = base[gid]
+        self._mem_enforce = prev_enforce
+
+        # Constrained polish (under the base unary, budget enforced). No single λ
+        # recovers integer solutions inside the (memory, cost) hull; coordinate +
+        # star search restricted to the budget can climb from an over-sharded
+        # point back up to a cheaper intermediate-memory one. We polish both the
+        # bisection's feasible point and the repaired closest-over-budget seed and
+        # keep the cheapest feasible result.
+        def polish(snapshot):
+            for gid, ci in enumerate(snapshot):
+                self._set_group(gid, ci)
+            self._memory_repair()
+            self._coordinate_descent(deadline)
+            self._star_block_search(deadline)
+            a = self._current_memory()
+            if budget_low - eps <= a <= budget_high + eps:
+                obj = self.total_objective()
+                if obj < best["objective"]:
+                    best.update(
+                        objective=obj,
+                        snapshot=[g.current for g in self.groups],
+                        memory=a,
+                        lam=lam,
+                    )
+
+        for snap in (best["snapshot"], seed["snapshot"]):
+            if snap is not None:
+                polish(snap)
+
+        if best["snapshot"] is not None:
+            for gid, ci in enumerate(best["snapshot"]):
+                self._set_group(gid, ci)
+        else:
+            # Nothing landed in [low, high]; repair the last assignment in place.
+            self._memory_repair()
+
+        a = self._current_memory()
+        return {
+            "objective": self.total_objective(),
+            "memory": a,
+            "lam": lam,
+            "feasible": budget_low - eps <= a <= budget_high + eps,
+            "iters": iters,
+        }
+
     # ------------------------------------------------------------------ #
     # Write-back
     # ------------------------------------------------------------------ #
diff --git a/examples/_bench_mem_lagrangian.py b/examples/_bench_mem_lagrangian.py
new file mode 100644
index 00000000..6166a552
--- /dev/null
+++ b/examples/_bench_mem_lagrangian.py
@@ -0,0 +1,237 @@
+"""Compare the Lagrangian memory-constrained approximate solve against the LP
+(relaxation) optimum across a sweep of parameter-memory budgets.
+
+The optimizer (the expensive build) is constructed ONCE; each budget only
+re-runs the cheap solves. For every budget factor `high` (with low=0):
+  - LP: set the memory constraint and solve the (integral) relaxation -> the
+    exact constrained optimum (gold standard).
+  - Lagrangian approx: fold lambda * ratio into the unaries and bisect lambda
+    until the achieved memory lands in the same [low, high] budget.
+The two solvers are pinned to the SAME numeric budget (read back from the LP's
+constraint rows) so the comparison is apples-to-apples.
+
+Env: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN,
+HIGH_FACTORS (comma list, default sweep), BP_ITERS.
+"""
+import logging
+import os
+import time
+from unittest.mock import patch
+
+import pulp
+import torch
+from torch.distributed.fsdp import MixedPrecisionPolicy
+from torch.distributed.tensor.placement_types import Replicate, Shard
+from torch.testing._internal.distributed.fake_pg import FakeStore
+
+from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
+from autoparallel.api import AutoParallel
+from autoparallel.approximate_sharding import ApproximateShardingSolver
+from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
+from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
+
+logging.basicConfig(level=logging.ERROR)
+
+
+def log(msg):
+    print(msg, flush=True)
+
+
+_PATCHES = [
+    patch("torch.cuda.device_count", lambda: 8),
+    patch("torch.cuda.get_device_name", lambda *a, **k: "H100"),
+    patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)),
+    patch(
+        "torch.cuda.get_device_properties",
+        lambda *a, **k: type(
+            "P",
+            (),
+            {
+                "major": 9,
+                "minor": 0,
+                "name": "H100",
+                "total_memory": 80 * 1024**3,
+                "multi_processor_count": 132,
+            },
+        )(),
+    ),
+]
+for p in _PATCHES:
+    p.start()
+
+MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b")
+N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
+SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4)))
+MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
+BP_ITERS = int(os.environ.get("BP_ITERS", "120"))
+HIGH_FACTORS = [
+    float(x)
+    for x in os.environ.get(
+        "HIGH_FACTORS", "0.0156,0.03125,0.0625,0.125,0.25,0.5,1.0"
+    ).split(",")
+]
+# On budgets where the LP relaxation is fractional (its optimum is an
+# unachievable lower bound) also solve the true ILP to report the achievable gap.
+RUN_ILP = os.environ.get("RUN_ILP", "0") == "1"
+ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "300"))
+
+world_size = 1
+for d in MESH_SHAPE:
+    world_size *= d
+
+_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp")}
+mesh_names = _NAMES[len(MESH_SHAPE)]
+fake_store = FakeStore()
+torch.distributed.init_process_group(
+    "fake", store=fake_store, rank=0, world_size=world_size
+)
+mesh = torch.distributed.device_mesh.init_device_mesh(
+    "cuda", MESH_SHAPE, mesh_dim_names=mesh_names
+)
+
+vocab_size = 128256
+batch_size = int(os.environ.get("BATCH", str(2 * mesh.shape[0])))
+seqlen = SEQLEN
+
+
+def model_fn():
+    args = TransformerModelArgs(
+        dim=2048,
+        n_layers=16,
+        n_heads=32,
+        n_kv_heads=8,
+        ffn_dim_multiplier=1.5,
+        multiple_of=256,
+        rope_theta=500000,
+        vocab_size=vocab_size,
+        max_seq_len=seqlen,
+    )
+    if MODEL_TYPE == "8b":
+        args = TransformerModelArgs(
+            dim=4096,
+            n_layers=32,
+            n_heads=32,
+            n_kv_heads=8,
+            ffn_dim_multiplier=1.3,
+            multiple_of=1024,
+            rope_theta=500000,
+            vocab_size=vocab_size,
+            max_seq_len=seqlen,
+        )
+    if N_LAYERS:
+        args.n_layers = N_LAYERS
+    with torch.device("meta"):
+        return Transformer(args)
+
+
+def input_fn():
+    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
+
+
+set_nccl_topo_config(detect_nccl_topo_config(mesh))
+mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
+
+log(
+    f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} "
+    f"world={world_size} seqlen={seqlen} bp_iters={BP_ITERS}"
+)
+
+# ---- build once ----
+t = time.perf_counter()
+autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True)
+autop.__enter__()
+ndim = mesh.ndim
+x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1)
+out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding
+# Build with a LOOSE budget so the approx build does not pin params to the
+# min-ratio (fully-sharded) choices; the per-budget sweep overrides the budget
+# numerically afterward. (A tight default would prune param strategies at build
+# time and freeze the achievable memory.)
+autop.add_parameter_memory_constraint(low=0.0, high=1.0)
+autop.add_input_constraints([x_sharding])
+autop.add_output_constraints([out_sharding])
+opt = autop.sharding_optimizer
+log(
+    f"[build] optimizer ready in {time.perf_counter() - t:.2f}s "
+    f"vars={len(opt.pulp_variables)} nodes={len(opt.nodes)}"
+)
+
+# build the approximate solver once (ratios / factor graph / mem unary cached)
+t = time.perf_counter()
+approx = ApproximateShardingSolver(opt, bp_iters=BP_ITERS)
+approx._build_problem()
+approx._build_factors()
+approx._build_mem_unary()
+log(
+    f"[build] approx solver ready in {time.perf_counter() - t:.2f}s "
+    f"groups={len(approx.groups)} "
+    f"params={len(approx._memory['param_idxs']) if approx._memory else 0}"
+)
+opt._set_objective()
+
+
+def lp_budget():
+    """Read back the exact [low, high] the LP applied, so approx uses the same."""
+    ch = opt.prob.constraints["memory_constraint_high"]
+    cl = opt.prob.constraints["memory_constraint_low"]
+    return -cl.constant, -ch.constant
+
+
+log("\n" + "=" * 110)
+log(
+    f"{'high_f':>8} | {'budget':>16} | {'LP obj':>12} {'frac':>7} {'LP s':>6} | "
+    f"{'approx obj':>12} {'mem':>7} {'lam':>9} {'feas':>5} {'s':>5} | "
+    f"{'gap/LP':>7} {'ILP obj':>12} {'gap/ILP':>8}"
+)
+log("-" * 110)
+
+rows = []
+for hf in HIGH_FACTORS:
+    opt._memory_constraint = (0.0, hf)
+    t = time.perf_counter()
+    lp = opt.solve_lp_relaxation(verbose=False, extract=False)
+    lp_s = time.perf_counter() - t
+    lp_obj = lp["objective"]
+    frac = f"{lp['n_fractional']}/{lp['n_vars']}"
+    blow, bhigh = lp_budget()
+
+    approx._memory["budget_low"] = blow
+    approx._memory["budget_high"] = bhigh
+    approx._memory["tight"] = abs(bhigh - blow) < 1e-9
+    t = time.perf_counter()
+    res = approx.solve_lagrangian(blow, bhigh, max_iter=24)
+    ap_s = time.perf_counter() - t
+    ap_obj = res["objective"]
+    gap = (ap_obj - lp_obj) / lp_obj * 100 if lp_obj else float("nan")
+
+    ilp_obj, gap_ilp = None, None
+    if RUN_ILP and lp["n_fractional"] > 0:
+        opt._set_objective()
+        opt._apply_memory_constraint()
+        opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, timeLimit=ILP_TIMEOUT))
+        ilp_obj = pulp.value(opt.prob.objective)
+        gap_ilp = (ap_obj - ilp_obj) / ilp_obj * 100 if ilp_obj else float("nan")
+
+    rows.append((hf, lp_obj, ap_obj, gap, res["feasible"], ilp_obj, gap_ilp))
+    log(
+        f"{hf:>8.4g} | [{blow:>6.3f},{bhigh:>7.3f}] | {lp_obj:>12.1f} {frac:>7} "
+        f"{lp_s:>5.1f}s | {ap_obj:>12.1f} {res['memory']:>7.3f} {res['lam']:>9.4g} "
+        f"{str(res['feasible']):>5} {ap_s:>4.1f}s | {gap:>+6.2f}% "
+        f"{('%.1f' % ilp_obj) if ilp_obj else '-':>12} "
+        f"{('%+.2f%%' % gap_ilp) if gap_ilp is not None else '-':>8}"
+    )
+
+log("=" * 110)
+gaps = [r[3] for r in rows if r[1]]
+feas = [r[4] for r in rows]
+if gaps:
+    log(
+        f"gap vs LP: mean={sum(gaps)/len(gaps):+.2f}% max={max(gaps):+.2f}% "
+        f"min={min(gaps):+.2f}%  feasible={sum(feas)}/{len(feas)}"
+    )
+gi = [r[6] for r in rows if r[6] is not None]
+if gi:
+    log(
+        f"gap vs ILP (fractional-LP budgets): mean={sum(gi)/len(gi):+.2f}% "
+        f"max={max(gi):+.2f}%"
+    )
diff --git a/tests/test_approximate_sharding.py b/tests/test_approximate_sharding.py
index 05d05314..b75ebe52 100644
--- a/tests/test_approximate_sharding.py
+++ b/tests/test_approximate_sharding.py
@@ -90,6 +90,41 @@ def test_approx_objective_close_to_ilp():
         )
 
 
+@apply_cuda_patches
+@pytest.mark.filterwarnings("ignore:Constructing LpVariable")
+@pytest.mark.filterwarnings("ignore:Overwriting previously set objective")
+def test_approx_memory_constrained_matches_ilp():
+    """A non-tight parameter-memory budget routes the approx solver through the
+    Lagrangian relaxation. The result must respect the budget and stay within a
+    small objective gap of the budget-constrained ILP optimum."""
+    mesh = _fake_2d_mesh()
+    with _tiny_llama3_autop(mesh) as autop:
+        # high=0.5 > 1/world_size, so the budget is non-tight (params are not
+        # pinned at build time) and can bind the runtime-optimal placement.
+        autop.add_parameter_memory_constraint(low=0.0, high=0.5)
+        autop.add_input_constraints([(Shard(0),) + (Replicate(),) * (mesh.ndim - 1)])
+        autop.add_output_constraints([(Shard(0), Shard(2))])
+        opt = autop.sharding_optimizer
+
+        autop.optimize_placement(verbose=False, solver="approx")
+        approx_objective = pulp.value(opt.prob.objective)
+        # Materialize the memory rows and check the approx assignment against ALL
+        # constraints, including the budget it was solved under.
+        opt._apply_memory_constraint()
+        violated = [n for n, c in opt.prob.constraints.items() if not c.valid()]
+        assert not violated, f"approx violated {len(violated)} constraints"
+
+        autop.optimize_placement(verbose=False, solver="ilp")
+        ilp_objective = pulp.value(opt.prob.objective)
+
+        assert math.isfinite(approx_objective)
+        assert approx_objective >= ilp_objective - 1e-6  # ILP is optimal
+        assert approx_objective <= ilp_objective * 1.05 + 1e-6, (
+            f"approx={approx_objective} ilp={ilp_objective} "
+            f"gap={(approx_objective / ilp_objective - 1) * 100:.2f}%"
+        )
+
+
 @apply_cuda_patches
 @pytest.mark.filterwarnings("ignore:Constructing LpVariable")
 @pytest.mark.filterwarnings("ignore:Overwriting previously set objective")

From 99339c3015f91bb545d6fa2089f474e2a319952c Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Sun, 7 Jun 2026 11:17:40 -0700
Subject: [PATCH 25/27] Mark annotation propagation and DP solver as
 experimental

The Shardy-like annotation propagation (annotate_* / propagate_annotations)
and the DP-based solver are opt-in and off by default: annotations do nothing
unless explicitly propagated before optimize_placement(), and the DP solver is
only reachable via the non-default solver_backend="dp" (not exposed through
AutoParallel) and still raises NotImplementedError. Document them as
experimental / unstable so the default solve path is unambiguous.

Authored with Claude.
---
 autoparallel/api.py               | 12 ++++++++++--
 autoparallel/optimize_sharding.py | 21 +++++++++++++--------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/autoparallel/api.py b/autoparallel/api.py
index ef664dc4..5e99d7d4 100644
--- a/autoparallel/api.py
+++ b/autoparallel/api.py
@@ -378,6 +378,11 @@ def add_output_constraints(self, constraints):
         self.output_constraints = constraints
 
     # ---- Sharding annotations (Shardy-like propagation) ----
+    # EXPERIMENTAL: opt-in only. These have no effect unless you call an
+    # annotate_* method and then propagate_annotations() before
+    # optimize_placement(); the default solve path never invokes them. The
+    # propagation may shrink the search space in ways that move the objective off
+    # the full-ILP optimum, so treat results as unstable.
 
     def _normalize_placements(self, placements):
         """Pad/validate a placement tuple to mesh.ndim, leaving missing trailing
@@ -499,12 +504,15 @@ def _mirror_annotations_to_backward(self):
         return mirrored
 
     def propagate_annotations(self, verbose=True, aggressive=False, method="fix"):
-        """Propagate the registered annotations Shardy-style and turn the
+        """EXPERIMENTAL (opt-in, off by default; may be unstable).
+
+        Propagate the registered annotations Shardy-style and turn the
         unambiguously-determined nodes into ILP constraints, shrinking the
         search space.  Returns a :class:`PropagationResult`.
 
         Call this after the ``annotate_*`` / ``add_*_constraint`` calls and
-        before :meth:`optimize_placement`.
+        before :meth:`optimize_placement`.  The default solve path does not call
+        this; nothing happens unless you invoke it explicitly.
 
         With ``aggressive=False`` (the default) only genuine ``Shard`` axes are
         pinned, which keeps the full-ILP optimum reachable.  ``aggressive=True``
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index 9e73889e..b14cb142 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -172,9 +172,7 @@ def _par_node_edge_costs(node_idx):
         arg_rows = []
         for argi, redist_costs in enumerate(output_strategy.redistribute_cost):
             producer_strategy = (
-                producer_strategies[argi]
-                if argi < len(producer_strategies)
-                else None
+                producer_strategies[argi] if argi < len(producer_strategies) else None
             )
             arg_rows.append(
                 [
@@ -193,7 +191,6 @@ def _par_node_edge_costs(node_idx):
     return node_idx, out_data
 
 
-
 def concretize_symint(val):
     """Concretize a SymInt to a plain int, pass through other values.
 
@@ -308,6 +305,14 @@ class DPTopology:
 
 
 class DPBasedShardingSolver:
+    """EXPERIMENTAL / incomplete — not part of the supported solver path.
+
+    Only reachable when ``ShardingOptimizer`` is built with the non-default
+    ``solver_backend="dp"`` (not exposed through ``AutoParallel``), and today it
+    only builds a topological order: :meth:`get_solution` raises
+    ``NotImplementedError``. Kept for in-progress work; do not rely on it.
+    """
+
     def __init__(self, optimizer):
         self.optimizer = optimizer
         self.topology: Optional[DPTopology] = None
@@ -1133,9 +1138,7 @@ def _compute_node_edge_costs(self, root_idxs):
                 # order as the serial path. This keeps the PuLP objective's
                 # lpSum term order identical too, so even the ILP path is
                 # bit-for-bit unchanged (float addition is not associative).
-                return list(
-                    pool.imap(_par_node_edge_costs, root_idxs, chunksize=4)
-                )
+                return list(pool.imap(_par_node_edge_costs, root_idxs, chunksize=4))
         finally:
             _FORK_OPT = None
 
@@ -1167,7 +1170,9 @@ def _find_decision_var(self, node_idx, argi, out_idx):
         that only need per-strategy costs can use whichever edge survived.
         """
         strategy = self.strats[self.nodes[node_idx]].strategies[out_idx]
-        n_inp = len(strategy.redistribute_cost[argi]) if strategy.redistribute_cost else 1
+        n_inp = (
+            len(strategy.redistribute_cost[argi]) if strategy.redistribute_cost else 1
+        )
         for inp_idx in range(n_inp):
             key = (node_idx, argi, out_idx, inp_idx)
             if key in self.decision_vars:

From 65c95036f817ea172f4cb4c8004a45a3b7b6f8ca Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Mon, 8 Jun 2026 12:08:51 -0700
Subject: [PATCH 26/27] Prepare approx-solver PR: drop qwen3/scratch benches,
 fix test cache isolation

Excludes the Qwen3 model + examples (shipped in a separate PR) and the scratch
_bench_/_sanity scripts from this branch, and applies black/isort formatting.

Adds an autouse conftest fixture that clears the process-global
placement-options cache before each test, so an optimizer build never reuses
stale strategies from a prior test's model (this otherwise made
test_lp_relaxation fail when run after test_approximate_sharding).

Authored with Claude.
---
 autoparallel/_testing/models/dsv3.py       |   2 +-
 autoparallel/_testing/models/qwen3.py      | 976 ---------------------
 autoparallel/serialization.py              |   3 +-
 examples/_bench_3d_cert.py                 | 154 ----
 examples/_bench_anno.py                    | 116 ---
 examples/_bench_approx.py                  | 166 ----
 examples/_bench_approx_diag.py             | 173 ----
 examples/_bench_approx_ils.py              | 136 ---
 examples/_bench_approx_sweep.py            | 106 ---
 examples/_bench_build_profile.py           |  93 --
 examples/_bench_build_verify.py            |  92 --
 examples/_bench_dp_alone.py                | 103 ---
 examples/_bench_lp_3d.py                   | 107 ---
 examples/_bench_lp_integrality.py          | 118 ---
 examples/_bench_mem_lagrangian.py          | 237 -----
 examples/_bench_merge.py                   | 293 -------
 examples/_bench_sizes.py                   | 166 ----
 examples/_bench_trws.py                    | 173 ----
 examples/_sanity_llama3.py                 | 223 -----
 examples/example_llama3.py                 |   3 -
 examples/example_qwen3.py                  | 242 -----
 examples/example_sanity_check_qwen3.py     | 335 -------
 examples/example_sanity_check_qwen3_moe.py | 466 ----------
 examples/example_torchtitan_qwen3_dense.py | 370 --------
 tests/conftest.py                          |  10 +
 tests/test_dsv3_torchtitan_config.py       |  35 -
 tests/test_optimize_placement.py           |   4 +-
 tests/test_qwen3.py                        | 323 -------
 28 files changed, 13 insertions(+), 5212 deletions(-)
 delete mode 100644 autoparallel/_testing/models/qwen3.py
 delete mode 100644 examples/_bench_3d_cert.py
 delete mode 100644 examples/_bench_anno.py
 delete mode 100644 examples/_bench_approx.py
 delete mode 100644 examples/_bench_approx_diag.py
 delete mode 100644 examples/_bench_approx_ils.py
 delete mode 100644 examples/_bench_approx_sweep.py
 delete mode 100644 examples/_bench_build_profile.py
 delete mode 100644 examples/_bench_build_verify.py
 delete mode 100644 examples/_bench_dp_alone.py
 delete mode 100644 examples/_bench_lp_3d.py
 delete mode 100644 examples/_bench_lp_integrality.py
 delete mode 100644 examples/_bench_mem_lagrangian.py
 delete mode 100644 examples/_bench_merge.py
 delete mode 100644 examples/_bench_sizes.py
 delete mode 100644 examples/_bench_trws.py
 delete mode 100644 examples/_sanity_llama3.py
 delete mode 100644 examples/example_qwen3.py
 delete mode 100644 examples/example_sanity_check_qwen3.py
 delete mode 100644 examples/example_sanity_check_qwen3_moe.py
 delete mode 100644 examples/example_torchtitan_qwen3_dense.py
 delete mode 100644 tests/test_dsv3_torchtitan_config.py
 delete mode 100644 tests/test_qwen3.py

diff --git a/autoparallel/_testing/models/dsv3.py b/autoparallel/_testing/models/dsv3.py
index 05f78a92..5a897b71 100644
--- a/autoparallel/_testing/models/dsv3.py
+++ b/autoparallel/_testing/models/dsv3.py
@@ -1581,7 +1581,7 @@ def __init__(
                 route_norm=moe_cfg.router.route_norm,
                 route_scale=moe_cfg.router.route_scale,
                 score_before_experts=moe_cfg.experts.token_dispatcher.score_before_experts,
-                use_grouped_mm=getattr(moe_cfg.experts, "use_grouped_mm", True),
+                use_grouped_mm=moe_cfg.experts.use_grouped_mm,
                 load_balance_coeff=moe_cfg.load_balance_coeff,
                 mesh=mesh,
                 compute_dtype=compute_dtype,
diff --git a/autoparallel/_testing/models/qwen3.py b/autoparallel/_testing/models/qwen3.py
deleted file mode 100644
index 7bef8b17..00000000
--- a/autoparallel/_testing/models/qwen3.py
+++ /dev/null
@@ -1,976 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import math
-from dataclasses import dataclass
-from typing import Callable, ClassVar, Optional
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torch.distributed.tensor import DeviceMesh
-from torch.distributed.tensor.placement_types import Partial, Replicate, Shard
-from torch.fx import traceback as fx_traceback
-from torch.nn.attention import sdpa_kernel, SDPBackend
-
-from autoparallel._testing.models.dsv3 import (
-    _permute,
-    _run_experts_for_loop,
-    _run_experts_grouped_mm,
-    _token_combine,
-)
-from autoparallel.collectives import all_to_all, axis_size, local_map
-
-
-def has_cuda_capability(major: int, minor: int) -> bool:
-    return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (
-        major,
-        minor,
-    )
-
-
-class ScaledDotProductAttention(torch.nn.Module):
-    backends: ClassVar[list[SDPBackend]] = []
-
-    def __init__(self, attn_mask_type: str) -> None:
-        super().__init__()
-        if attn_mask_type != "causal":
-            raise ValueError("Qwen3 with SDPA currently only supports causal mask.")
-
-        ScaledDotProductAttention._init_backend()
-
-    @classmethod
-    def _init_backend(cls) -> None:
-        if cls.backends:
-            return
-
-        cls.backends = [
-            SDPBackend.FLASH_ATTENTION,
-            SDPBackend.EFFICIENT_ATTENTION,
-            SDPBackend.MATH,
-        ]
-        if has_cuda_capability(10, 0):
-            cls.backends.insert(0, SDPBackend.CUDNN_ATTENTION)
-
-    def forward(
-        self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        scale: float | None = None,
-    ) -> torch.Tensor:
-        assert self.backends, "SDPA backends should not be empty."
-        with sdpa_kernel(self.backends, set_priority=True):
-            return F.scaled_dot_product_attention(
-                q,
-                k,
-                v,
-                is_causal=True,
-                scale=scale,
-            )
-
-
-def build_attention(attn_mask_type: str):
-    if attn_mask_type != "causal":
-        raise ValueError("Qwen3 with SDPA currently only supports causal mask.")
-    return ScaledDotProductAttention(attn_mask_type)
-
-
-@dataclass
-class Qwen3ModelArgs:
-    dim: int = 4096
-    n_layers: int = 36
-    n_heads: int = 32
-    n_kv_heads: Optional[int] = 8
-    head_dim: int = 128
-    hidden_dim: int = 12288
-    vocab_size: int = 151936
-    norm_eps: float = 1e-6
-    rope_theta: float = 1000000.0
-    max_seq_len: int = 4096
-    depth_init: bool = True
-    attn_mask_type: str = "causal"
-    eos_id: int = 0
-    enable_weight_tying: bool = False
-    moe_enabled: bool = False
-    moe_hidden_dim: int = 768
-    num_experts: int = 64
-    top_k: int = 8
-    route_norm: bool = True
-    route_scale: float = 1.0
-    score_before_experts: bool = False
-    use_grouped_mm: bool = True
-    load_balance_coeff: Optional[float] = 1e-3
-    moe_axis_name: str = "ep"
-
-    def __post_init__(self) -> None:
-        n_kv_heads = self.n_heads if self.n_kv_heads is None else self.n_kv_heads
-        if self.n_heads % n_kv_heads != 0:
-            raise ValueError(
-                f"n_heads ({self.n_heads}) must be divisible by "
-                f"n_kv_heads ({n_kv_heads})."
-            )
-        if self.moe_enabled and self.top_k > self.num_experts:
-            raise ValueError(
-                f"top_k ({self.top_k}) must be <= num_experts ({self.num_experts})."
-            )
-
-    def update_from_config(self, job_config, tokenizer) -> None:
-        self.vocab_size = tokenizer.n_words
-        self.max_seq_len = job_config.training.seq_len
-        self.eos_id = tokenizer.eos_id
-
-    def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
-        nparams = sum(p.numel() for p in model.parameters())
-        nparams_embedding = sum(
-            sum(p.numel() for p in m.parameters())
-            for m in model.children()
-            if isinstance(m, nn.Embedding)
-        )
-
-        l, h, q, t = (
-            self.n_layers,
-            self.n_heads,
-            self.head_dim,
-            seq_len,
-        )
-        num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
-        return nparams, num_flops_per_token
-
-
-def qwen3_args_from_torchtitan_config(config) -> Qwen3ModelArgs:
-    """Build AutoParallel Qwen3 args from TorchTitan's Qwen3Model.Config."""
-    if not config.layers:
-        raise ValueError("Qwen3 config must contain at least one layer.")
-
-    first_layer = config.layers[0]
-    attention = first_layer.attention
-    moe = first_layer.moe
-
-    if getattr(attention, "fuse_qkv", False):
-        raise ValueError("AutoParallel Qwen3 does not support fused QKV yet.")
-
-    moe_enabled = moe is not None
-    if moe_enabled:
-        hidden_dim = 0
-        moe_hidden_dim = moe.experts.hidden_dim
-        num_experts = moe.num_experts
-        top_k = moe.router.top_k
-        route_norm = moe.router.route_norm
-        route_scale = moe.router.route_scale
-        score_before_experts = moe.experts.token_dispatcher.score_before_experts
-        load_balance_coeff = moe.load_balance_coeff
-    else:
-        hidden_dim = first_layer.feed_forward.w1.out_features
-        moe_hidden_dim = 0
-        num_experts = 0
-        top_k = 1
-        route_norm = True
-        route_scale = 1.0
-        score_before_experts = False
-        load_balance_coeff = None
-
-    return Qwen3ModelArgs(
-        dim=config.dim,
-        n_layers=len(config.layers),
-        n_heads=attention.n_heads,
-        n_kv_heads=attention.n_kv_heads,
-        head_dim=attention.head_dim,
-        hidden_dim=hidden_dim,
-        vocab_size=config.vocab_size,
-        norm_eps=config.norm.eps,
-        rope_theta=config.rope.theta,
-        max_seq_len=config.rope.max_seq_len,
-        attn_mask_type=attention.mask_type,
-        enable_weight_tying=config.enable_weight_tying,
-        moe_enabled=moe_enabled,
-        moe_hidden_dim=moe_hidden_dim,
-        num_experts=num_experts,
-        top_k=top_k,
-        route_norm=route_norm,
-        route_scale=route_scale,
-        score_before_experts=score_before_experts,
-        load_balance_coeff=load_balance_coeff,
-    )
-
-
-def qwen3_debug_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=256,
-        n_layers=8,
-        n_heads=16,
-        n_kv_heads=8,
-        head_dim=128,
-        hidden_dim=3072,
-        vocab_size=2048,
-        max_seq_len=4096,
-        enable_weight_tying=True,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_0_6b_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=1024,
-        n_layers=28,
-        n_heads=16,
-        n_kv_heads=8,
-        head_dim=128,
-        hidden_dim=3072,
-        vocab_size=151936,
-        enable_weight_tying=True,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_1_7b_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=2048,
-        n_layers=28,
-        n_heads=16,
-        n_kv_heads=8,
-        head_dim=128,
-        hidden_dim=6144,
-        vocab_size=151936,
-        enable_weight_tying=True,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_4b_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=2560,
-        n_layers=36,
-        n_heads=32,
-        n_kv_heads=8,
-        head_dim=128,
-        hidden_dim=9728,
-        vocab_size=151936,
-        enable_weight_tying=True,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_8b_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs()
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_moe_debug_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=256,
-        n_layers=8,
-        n_heads=16,
-        n_kv_heads=8,
-        head_dim=128,
-        hidden_dim=3072,
-        vocab_size=2048,
-        max_seq_len=4096,
-        moe_enabled=True,
-        moe_hidden_dim=768,
-        num_experts=64,
-        top_k=8,
-        route_norm=True,
-        score_before_experts=False,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_30b_a3b_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=2048,
-        n_layers=48,
-        n_heads=32,
-        n_kv_heads=4,
-        head_dim=128,
-        hidden_dim=6144,
-        vocab_size=151936,
-        max_seq_len=262144,
-        moe_enabled=True,
-        moe_hidden_dim=768,
-        num_experts=128,
-        top_k=8,
-        route_norm=True,
-        score_before_experts=False,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def qwen3_235b_a22b_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=4096,
-        n_layers=94,
-        n_heads=64,
-        n_kv_heads=4,
-        head_dim=128,
-        hidden_dim=12288,
-        vocab_size=151936,
-        max_seq_len=4096,
-        rope_theta=5000000.0,
-        moe_enabled=True,
-        moe_hidden_dim=1536,
-        num_experts=128,
-        top_k=8,
-        route_norm=True,
-        score_before_experts=False,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def precompute_freqs_cos_sin(
-    dim: int,
-    max_seq_len: int,
-    theta: float = 1000000.0,
-) -> torch.Tensor:
-    freq = theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
-    inv_freq = 1.0 / freq
-    t = torch.arange(max_seq_len, dtype=inv_freq.dtype, device=inv_freq.device)
-    freqs = torch.outer(t, inv_freq).float()
-    freqs = torch.cat([freqs, freqs], dim=-1)
-    cos = freqs.cos()
-    sin = freqs.sin()
-    return torch.cat([cos, sin], dim=-1)
-
-
-def reshape_for_broadcast_cos_sin(
-    rope_cache: torch.Tensor,
-    x: torch.Tensor,
-) -> torch.Tensor:
-    bsz, seqlen, _, head_dim = x.shape
-    rope_cache = rope_cache[0:seqlen]
-    assert rope_cache.shape == (seqlen, head_dim * 2)
-    return rope_cache.view(1, seqlen, 1, head_dim * 2).expand(bsz, -1, -1, -1)
-
-
-def _rotate_half(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_emb_cos_sin(
-    xq: torch.Tensor,
-    xk: torch.Tensor,
-    rope_cache: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    head_dim = xq.shape[-1]
-    rope_cache = reshape_for_broadcast_cos_sin(rope_cache, xq)
-    cos = rope_cache[..., :head_dim].to(device=xq.device)
-    sin = rope_cache[..., head_dim:].to(device=xq.device)
-    xq_f = xq.float()
-    xk_f = xk.float()
-    xq_out = (xq_f * cos) + (_rotate_half(xq_f) * sin)
-    xk_out = (xk_f * cos) + (_rotate_half(xk_f) * sin)
-    return xq_out.type_as(xq), xk_out.type_as(xk)
-
-
-def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
-    bs, slen, n_kv_heads, head_dim = x.shape
-    if n_rep == 1:
-        return x
-    return (
-        torch.unsqueeze(x, dim=3)
-        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
-        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
-    )
-
-
-def _to_activation_device(tensor: torch.Tensor, activation: torch.Tensor) -> torch.Tensor:
-    if tensor.device != activation.device and tensor.device.type == "meta":
-        return tensor.to(activation.device)
-    return tensor
-
-
-def _rms_norm(x: torch.Tensor, norm: nn.RMSNorm) -> torch.Tensor:
-    weight = (
-        _to_activation_device(norm.weight, x)
-        if norm.weight is not None
-        else None
-    )
-    if weight is not None and weight.dtype != x.dtype:
-        weight = weight.to(dtype=x.dtype)
-    return F.rms_norm(x, norm.normalized_shape, weight, norm.eps).to(dtype=x.dtype)
-
-
-def _linear(x: torch.Tensor, linear: nn.Linear) -> torch.Tensor:
-    weight = _to_activation_device(linear.weight, x)
-    bias = (
-        _to_activation_device(linear.bias, x)
-        if linear.bias is not None
-        else None
-    )
-    if weight.dtype != x.dtype:
-        weight = weight.to(dtype=x.dtype)
-    if bias is not None and bias.dtype != x.dtype:
-        bias = bias.to(dtype=x.dtype)
-    return F.linear(x, weight, bias)
-
-
-class Attention(nn.Module):
-    def __init__(self, model_args: Qwen3ModelArgs):
-        super().__init__()
-        self.n_heads = model_args.n_heads
-        self.n_kv_heads = (
-            model_args.n_heads
-            if model_args.n_kv_heads is None
-            else model_args.n_kv_heads
-        )
-        self.n_rep = self.n_heads // self.n_kv_heads
-        self.head_dim = model_args.head_dim
-        self.scale = self.head_dim**-0.5
-
-        self.wq = nn.Linear(
-            model_args.dim, model_args.n_heads * self.head_dim, bias=False
-        )
-        self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(
-            model_args.n_heads * self.head_dim, model_args.dim, bias=False
-        )
-        self.q_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps)
-        self.k_norm = nn.RMSNorm(self.head_dim, eps=model_args.norm_eps)
-        self.sdpa = build_attention(model_args.attn_mask_type)
-
-    def init_weights(self, init_std: float):
-        for linear in (self.wq, self.wk, self.wv):
-            nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
-        nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)
-        self.q_norm.reset_parameters()
-        self.k_norm.reset_parameters()
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        freqs_cos_sin: torch.Tensor,
-    ):
-        bs, seqlen, _ = x.shape
-        xq, xk, xv = _linear(x, self.wq), _linear(x, self.wk), _linear(x, self.wv)
-
-        xq = xq.view(bs, seqlen, -1, self.head_dim)
-        xk = xk.view(bs, seqlen, -1, self.head_dim)
-        xv = xv.view(bs, seqlen, -1, self.head_dim)
-
-        xq = _rms_norm(xq, self.q_norm)
-        xk = _rms_norm(xk, self.k_norm)
-        freqs_cos_sin = _to_activation_device(freqs_cos_sin, xq)
-        xq, xk = apply_rotary_emb_cos_sin(xq, xk, freqs_cos_sin)
-
-        keys = repeat_kv(xk, self.n_rep)
-        values = repeat_kv(xv, self.n_rep)
-
-        xq = xq.transpose(1, 2)
-        xk = keys.transpose(1, 2)
-        xv = values.transpose(1, 2)
-
-        output = self.sdpa(xq, xk, xv, scale=self.scale)
-
-        output = output.transpose(1, 2).contiguous()
-        output = output.view(bs, seqlen, -1)
-        return _linear(output, self.wo)
-
-
-class FeedForward(nn.Module):
-    def __init__(self, dim: int, hidden_dim: int):
-        super().__init__()
-        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
-        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
-        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
-
-    def forward(self, x):
-        return _linear(F.silu(_linear(x, self.w1)) * _linear(x, self.w3), self.w2)
-
-    def init_weights(self, init_std: float):
-        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
-        for linear in (self.w2, self.w3):
-            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
-
-
-class GroupedExperts(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        num_experts: int,
-        use_grouped_mm: bool,
-    ):
-        super().__init__()
-        self.num_experts = num_experts
-        self.w1 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
-        self.w2 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
-        self.w3 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
-        self.use_grouped_mm = use_grouped_mm
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        num_tokens_per_expert: torch.Tensor,
-    ) -> torch.Tensor:
-        if self.use_grouped_mm:
-            return _run_experts_grouped_mm(
-                self.w1, self.w2, self.w3, x, num_tokens_per_expert
-            )
-        return _run_experts_for_loop(
-            self.w1, self.w2, self.w3, x, num_tokens_per_expert
-        )
-
-    def init_weights(self, init_std: float):
-        nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02)
-        nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std)
-        nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std)
-
-
-def _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name):
-    ep_size = axis_size(axis_name)
-    num_tokens_per_expert_group = all_to_all(
-        num_tokens_per_expert,
-        None,
-        None,
-        axis_name,
-    )
-
-    with torch.no_grad():
-        input_splits = (
-            num_tokens_per_expert.view(ep_size, -1)
-            .sum(dim=1)
-            .to(torch.device("cpu"), non_blocking=True)
-        )
-        output_splits = (
-            num_tokens_per_expert_group.view(ep_size, -1)
-            .sum(dim=1)
-            .to(torch.device("cpu"), non_blocking=False)
-        )
-        input_splits = input_splits.tolist()
-        output_splits = output_splits.tolist()
-
-    with fx_traceback.annotate({"comm_region": "token_dispatch"}):
-        routed_input = all_to_all(
-            routed_input,
-            output_splits,
-            input_splits,
-            axis_name,
-        )
-
-    num_local_experts = num_tokens_per_expert_group.shape[0] // ep_size
-    return (
-        *_permute(
-            routed_input,
-            num_tokens_per_expert_group,
-            ep_size,
-            num_local_experts,
-        ),
-        input_splits,
-        output_splits,
-    )
-
-
-def qwen3_moe_local_mapped_region(
-    x: torch.Tensor,
-    selected_experts_indices: torch.Tensor,
-    top_scores: torch.Tensor,
-    experts_w1: torch.Tensor,
-    experts_w3: torch.Tensor,
-    experts_w2: torch.Tensor,
-    out: torch.Tensor,
-    top_k: int,
-    num_experts: int,
-    score_before_experts: bool,
-    axis_name: str,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    dim = x.shape[-1]
-    ep_size = axis_size(axis_name)
-    if num_experts % ep_size != 0:
-        raise ValueError(
-            f"num_experts ({num_experts}) must be divisible by "
-            f"axis_size({axis_name!r}) ({ep_size})."
-        )
-
-    num_tokens_per_expert = torch.histc(
-        selected_experts_indices.flatten(),
-        bins=num_experts,
-        min=0,
-        max=num_experts,
-    ).view(-1)
-
-    token_indices_experts_sorted = torch.argsort(
-        selected_experts_indices.view(-1), stable=True
-    )
-    top_scores_experts_sorted = top_scores.view(-1)[token_indices_experts_sorted]
-    token_indices_experts_sorted = token_indices_experts_sorted // top_k
-
-    routed_input = x[token_indices_experts_sorted]
-    if score_before_experts:
-        routed_input = (
-            routed_input.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1)
-        ).to(x.dtype)
-
-    shape = routed_input.shape
-    (
-        input_shape,
-        routed_input,
-        permuted_indices,
-        num_tokens_per_expert_group,
-        input_splits,
-        output_splits,
-    ) = _qwen3_token_dispatch(routed_input, num_tokens_per_expert, axis_name)
-
-    routed_output = _run_experts_grouped_mm(
-        experts_w1,
-        experts_w2,
-        experts_w3,
-        routed_input,
-        num_tokens_per_expert_group,
-    )
-    routed_output = _token_combine(
-        routed_output,
-        input_shape,
-        permuted_indices,
-        input_splits,
-        output_splits,
-        axis_name,
-    )
-
-    torch._check(routed_output.shape[0] == shape[0])
-    if not score_before_experts:
-        routed_output = (
-            routed_output.to(torch.float32) * top_scores_experts_sorted.reshape(-1, 1)
-        ).to(routed_output.dtype)
-
-    out = out.scatter_add(
-        dim=0,
-        index=token_indices_experts_sorted.reshape(-1, 1).expand(-1, dim),
-        src=routed_output,
-    )
-    return out, num_tokens_per_expert
-
-class MoE(nn.Module):
-    def __init__(
-        self,
-        model_args: Qwen3ModelArgs,
-        mesh: DeviceMesh | None = None,
-        axis_name: str | None = None,
-    ):
-        super().__init__()
-        self.mesh = mesh
-        self.axis_name = axis_name or model_args.moe_axis_name
-        self.num_experts = model_args.num_experts
-        self.top_k = model_args.top_k
-        self.route_norm = model_args.route_norm
-        self.route_scale = model_args.route_scale
-        self.score_before_experts = model_args.score_before_experts
-        self.load_balance_coeff = model_args.load_balance_coeff
-
-        self.router = nn.Linear(model_args.dim, model_args.num_experts, bias=False)
-        self.experts = GroupedExperts(
-            dim=model_args.dim,
-            hidden_dim=model_args.moe_hidden_dim,
-            num_experts=model_args.num_experts,
-            use_grouped_mm=model_args.use_grouped_mm,
-        )
-        self.register_buffer(
-            "expert_bias",
-            torch.zeros(model_args.num_experts, dtype=torch.float32),
-            persistent=self.load_balance_coeff is not None,
-        )
-        self.register_buffer(
-            "tokens_per_expert",
-            torch.zeros(model_args.num_experts, dtype=torch.float32),
-            persistent=False,
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        bs, slen, dim = x.shape
-        x = x.view(-1, dim)
-        experts_w1, experts_w2, experts_w3 = self.experts.parameters()
-        experts_w1 = _to_activation_device(experts_w1, x)
-        experts_w2 = _to_activation_device(experts_w2, x)
-        experts_w3 = _to_activation_device(experts_w3, x)
-
-        scores = F.linear(
-            x.to(torch.float32),
-            _to_activation_device(self.router.weight, x).to(torch.float32),
-            None,
-        )
-        scores = F.softmax(scores, dim=-1)
-        expert_bias = _to_activation_device(self.expert_bias, scores)
-        scores_for_choice = (
-            scores + expert_bias
-            if self.load_balance_coeff is not None
-            else scores
-        )
-        _, selected_experts_indices = torch.topk(
-            scores_for_choice,
-            k=self.top_k,
-            dim=-1,
-            sorted=False,
-        )
-
-        top_scores = scores.gather(dim=-1, index=selected_experts_indices)
-        if self.route_norm:
-            denominator = top_scores.sum(dim=-1, keepdim=True) + 1e-20
-            top_scores = top_scores / denominator
-        top_scores = top_scores * self.route_scale
-
-        # Qwen3 MoE has no shared expert path, but keeping the initial output
-        # differentiably tied to x matches the DSv3 local_map autograd shape.
-        out = x * 0
-        out, num_tokens_per_expert = local_map(
-            qwen3_moe_local_mapped_region,
-            out_placements=(
-                (Shard(0), Shard(0)),
-                (Partial(reduce_op="sum"), Partial(reduce_op="sum")),
-            ),
-            in_placements=(
-                (Shard(0), Shard(0)),
-                (Shard(0), Shard(0)),
-                (Shard(0), Shard(0)),
-                (Replicate(), Shard(0)),
-                (Replicate(), Shard(0)),
-                (Replicate(), Shard(0)),
-                (Shard(0), Shard(0)),
-                None,
-                None,
-                None,
-                None,
-            ),
-            redistribute_inputs=True,
-            in_grad_placements=None,
-            device_mesh=self.mesh,
-        )(
-            x,
-            selected_experts_indices,
-            top_scores,
-            experts_w1,
-            experts_w3,
-            experts_w2,
-            out,
-            self.top_k,
-            self.num_experts,
-            self.score_before_experts,
-            self.axis_name,
-        )
-        # This counter is only used for runtime load-balance diagnostics. During
-        # AutoParallel graph capture the module buffers are fake/meta tensors
-        # while the traced local_map output can be CUDA-fake, and recording this
-        # mutation is not needed for the solved training graph.
-        if not torch.compiler.is_compiling():
-            with torch.no_grad():
-                self.tokens_per_expert.add_(num_tokens_per_expert)  # type: ignore[operator]
-        return out.reshape(bs, slen, dim)
-
-    def init_weights(
-        self,
-        init_std: float,
-        buffer_device: torch.device,
-    ):
-        nn.init.trunc_normal_(self.router.weight, mean=0.0, std=init_std)
-        self.experts.init_weights(init_std)
-        with torch.device(buffer_device):
-            self.tokens_per_expert.zero_()  # type: ignore[operator]
-            self.expert_bias.zero_()  # type: ignore[operator]
-
-
-class TransformerBlock(nn.Module):
-    def __init__(
-        self,
-        layer_id: int,
-        model_args: Qwen3ModelArgs,
-        mesh: DeviceMesh | None = None,
-        moe_axis_name: str | None = None,
-    ):
-        super().__init__()
-        self.attention = Attention(model_args)
-        self.moe_enabled = model_args.moe_enabled
-        if self.moe_enabled:
-            self.moe = MoE(model_args, mesh=mesh, axis_name=moe_axis_name)
-        else:
-            self.feed_forward = FeedForward(
-                dim=model_args.dim,
-                hidden_dim=model_args.hidden_dim,
-            )
-        self.attention_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
-        self.ffn_norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
-
-        if model_args.depth_init:
-            self.weight_init_std = 0.02 / math.sqrt(2 * (layer_id + 1))
-        else:
-            self.weight_init_std = 0.02 / math.sqrt(2 * model_args.n_layers)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        freqs_cos_sin: torch.Tensor,
-    ):
-        h = x + self.attention(_rms_norm(x, self.attention_norm), freqs_cos_sin)
-        if self.moe_enabled:
-            out = h + self.moe(_rms_norm(h, self.ffn_norm))
-        else:
-            out = h + self.feed_forward(_rms_norm(h, self.ffn_norm))
-        return out
-
-    def init_weights(self, buffer_device: torch.device):
-        for norm in (self.attention_norm, self.ffn_norm):
-            norm.reset_parameters()
-        self.attention.init_weights(self.weight_init_std)
-        if self.moe_enabled:
-            self.moe.init_weights(self.weight_init_std, buffer_device)
-        else:
-            self.feed_forward.init_weights(self.weight_init_std)
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        model_args: Qwen3ModelArgs,
-        mesh: DeviceMesh | None = None,
-        moe_axis_name: str | None = None,
-    ):
-        super().__init__()
-        self.model_args = model_args
-        self.vocab_size = model_args.vocab_size
-        self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
-        self.enable_weight_tying = model_args.enable_weight_tying
-        self.mesh = mesh
-        self.moe_axis_name = moe_axis_name or model_args.moe_axis_name
-
-        self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
-        self.register_buffer(
-            "freqs_cos_sin",
-            self._precompute_freqs_cos_sin(),
-            persistent=True,
-        )
-
-        self.layers = torch.nn.ModuleDict()
-        for layer_id in range(model_args.n_layers):
-            self.layers[str(layer_id)] = TransformerBlock(
-                layer_id,
-                model_args,
-                mesh=mesh,
-                moe_axis_name=self.moe_axis_name,
-            )
-        self.norm = nn.RMSNorm(model_args.dim, eps=model_args.norm_eps)
-        self.lm_head = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
-
-        if self.enable_weight_tying:
-            self.tok_embeddings.weight = self.lm_head.weight
-
-    def init_weights(
-        self,
-        buffer_device: Optional[torch.device] = None,
-        seed: int | None = None,
-    ):
-        if seed is not None:
-            torch.manual_seed(seed)
-
-        if self.enable_weight_tying:
-            self.tok_embeddings.weight = self.lm_head.weight
-
-        buffer_device = buffer_device or self.freqs_cos_sin.device  # type: ignore[assignment]
-        with torch.device(buffer_device):  # type: ignore[arg-type]
-            self.freqs_cos_sin = self._precompute_freqs_cos_sin()
-
-        if not self.enable_weight_tying and self.tok_embeddings is not None:
-            nn.init.normal_(self.tok_embeddings.weight)
-        for layer in self.layers.values():
-            if layer is not None:
-                layer.init_weights(buffer_device)  # type: ignore[operator]
-        if self.norm is not None:
-            self.norm.reset_parameters()
-
-        final_out_std = self.model_args.dim**-0.5
-        cutoff_factor = 3
-        if self.lm_head is not None:
-            nn.init.trunc_normal_(
-                self.lm_head.weight,
-                mean=0.0,
-                std=final_out_std,
-                a=-cutoff_factor * final_out_std,
-                b=cutoff_factor * final_out_std,
-            )
-
-        if self.enable_weight_tying:
-            self.tok_embeddings.weight = self.lm_head.weight
-
-    def _precompute_freqs_cos_sin(self) -> torch.Tensor:
-        return precompute_freqs_cos_sin(
-            self.model_args.head_dim,
-            self.model_args.max_seq_len,
-            self.model_args.rope_theta,
-        )
-
-    def _token_embedding(self, tokens: torch.Tensor) -> torch.Tensor:
-        weight = self.tok_embeddings.weight
-        if weight.device != tokens.device and weight.device.type == "meta":
-            weight = weight.to(tokens.device)
-        return F.embedding(tokens, weight)
-
-    def forward(self, tokens: torch.Tensor, input_batch: Optional[torch.Tensor] = None):
-        h = self._token_embedding(tokens) if self.tok_embeddings is not None else tokens
-
-        for layer in self.layers.values():
-            h = layer(h, self.freqs_cos_sin)
-
-        h = _rms_norm(h, self.norm) if self.norm is not None else h
-        output = _linear(h, self.lm_head) if self.lm_head is not None else h
-        return output
-
-
-_MODULE_FQN = "module_fqn"
-
-
-def _annotate_once(fn: Callable, meta: dict):
-    if getattr(fn, "_graph_trainer_annotated", False):
-        return fn
-    wrapped = fx_traceback.annotate_fn(meta)(fn)
-    setattr(wrapped, "_graph_trainer_annotated", True)
-    return wrapped
-
-
-def _annotate_module_fqns(model: nn.Module) -> None:
-    for fqn, submodule in model.named_modules():
-        if fqn:
-            submodule.forward = _annotate_once(
-                submodule.forward,
-                {_MODULE_FQN: fqn},
-            )
-
-
-def annotate_qwen3_for_graph_trainer(model: Transformer) -> None:
-    """Attach graph_trainer-compatible FX annotations to AP's Qwen3 model."""
-    global qwen3_moe_local_mapped_region
-
-    qwen3_moe_local_mapped_region = _annotate_once(
-        qwen3_moe_local_mapped_region,
-        {"EP": "compute"},
-    )
-    MoE.forward = _annotate_once(  # type: ignore[method-assign]
-        MoE.forward,
-        {"EP": "compute"},
-    )
-    _annotate_module_fqns(model)
diff --git a/autoparallel/serialization.py b/autoparallel/serialization.py
index 4c9167d4..c9474746 100644
--- a/autoparallel/serialization.py
+++ b/autoparallel/serialization.py
@@ -193,8 +193,7 @@ def save_optimizer(opt, path):
         "dv_costs_keys": dv_costs_keys,
         "dv_costs_vals": dv_costs_vals,
         "cluster_links_node_by_name": {
-            opt.nodes[c].name: opt.nodes[r].name
-            for c, r in opt.cluster_links.items()
+            opt.nodes[c].name: opt.nodes[r].name for c, r in opt.cluster_links.items()
         },
         "constraint_log": opt._constraint_log,
         "selected_keys_by_name": selected_keys_by_name,
diff --git a/examples/_bench_3d_cert.py b/examples/_bench_3d_cert.py
deleted file mode 100644
index 956489cb..00000000
--- a/examples/_bench_3d_cert.py
+++ /dev/null
@@ -1,154 +0,0 @@
-"""3D optimality certificate for the merged solver on full LLaMA3-1B.
-
-The 3D ILP has ~8M binary variables; the exact CBC solve (and even CBC's LP
-relaxation) is impractical (a 2.6 GB MPS file; CBC simplex runs for hours). The
-LP relaxation is empirically integral for this problem (verified on 2D, where it
-equals the exact optimum), so its objective is a tight lower bound on the ILP
-optimum. We solve that LP with HiGHS (scipy.optimize.linprog), which handles the
-8M-variable sparse LP in minutes, then compare to the approximate solvers.
-
-One full PuLP build feeds: the HiGHS LP lower bound (optimality reference), and
-the prune+dp / merged approximate objectives. Reports the certified gaps. Env:
-MESH, SEQLEN.
-"""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import numpy as np
-import pulp
-import scipy.sparse as sp
-import torch
-from scipy.optimize import linprog
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-
-def log(m=""):
-    print(m, flush=True)
-
-
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-
-
-def model_fn():
-    args = TransformerModelArgs(
-        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5,
-        multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN)
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-def lp_lower_bound_highs(opt):
-    """Solve the LP relaxation (binaries -> [0,1]) of opt.prob with HiGHS and
-    return its objective: a tight lower bound on the ILP optimum."""
-    variables = opt.prob.variables()
-    idx = {v.name: i for i, v in enumerate(variables)}
-    n = len(variables)
-    c = np.zeros(n)
-    for v, coeff in opt.prob.objective.items():
-        c[idx[v.name]] += coeff
-    rows_eq, cols_eq, data_eq, b_eq = [], [], [], []
-    rows_ub, cols_ub, data_ub, b_ub = [], [], [], []
-    r_eq = r_ub = 0
-    for con in opt.prob.constraints.values():
-        rhs = -con.constant
-        items = list(con.items())
-        if con.sense == pulp.LpConstraintEQ:
-            for v, coeff in items:
-                rows_eq.append(r_eq); cols_eq.append(idx[v.name]); data_eq.append(coeff)
-            b_eq.append(rhs); r_eq += 1
-        else:  # LE: a<=b ; GE: a>=b -> -a<=-b
-            sign = 1.0 if con.sense == pulp.LpConstraintLE else -1.0
-            for v, coeff in items:
-                rows_ub.append(r_ub); cols_ub.append(idx[v.name]); data_ub.append(sign * coeff)
-            b_ub.append(sign * rhs); r_ub += 1
-    A_eq = sp.csr_matrix((data_eq, (rows_eq, cols_eq)), shape=(r_eq, n)) if r_eq else None
-    A_ub = sp.csr_matrix((data_ub, (rows_ub, cols_ub)), shape=(r_ub, n)) if r_ub else None
-    res = linprog(c, A_ub=A_ub, b_ub=(b_ub or None), A_eq=A_eq, b_eq=(b_eq or None),
-                  bounds=(0, 1), method="highs")
-    if not res.success:
-        raise RuntimeError(f"HiGHS LP failed: {res.message}")
-    return res.fun, n, r_eq + r_ub
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out = (Shard(0), Shard(2)) if ndim == 2 else x
-
-log(f"=== 3D cert (HiGHS): LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===")
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
-autop.__enter__()
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x])
-autop.add_output_constraints([out])
-opt = autop.sharding_optimizer
-opt._set_objective()
-opt._apply_memory_constraint()
-log(f"[build] {time.perf_counter()-t:.1f}s  decision_vars={len(opt.decision_vars)}  "
-    f"pulp_vars={len(opt.pulp_variables)}  constraints={len(opt.prob.constraints)}")
-
-# prune+dp (approx, no annotation) on the same problem.
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-prune_dp = opt.profile["approximate"]["objective"]
-log(f"[prune+dp]  approx {time.perf_counter()-t:.1f}s  objective={prune_dp:.1f}")
-
-# merged (prune+dp+annotated): propagate the TP plan, then approx-solve.
-cp = (None,) * (ndim - 1) + (Shard(0),)
-rp = (None,) * (ndim - 1) + (Shard(1),)
-for proj in ["wq", "wk", "wv"]:
-    autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp)
-autop.annotate_parameter("layers.*.attention.wo.weight", rp)
-for proj in ["w1", "w3"]:
-    autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp)
-autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp)
-autop.propagate_annotations(verbose=False, method="fix")
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-merged = opt.profile["approximate"]["objective"]
-log(f"[merged]    approx {time.perf_counter()-t:.1f}s  objective={merged:.1f}")
-
-# LP relaxation lower bound via HiGHS = optimality reference.
-t = time.perf_counter()
-lb, nvar, ncon = lp_lower_bound_highs(opt)
-log(f"[LP-bound]  HiGHS {time.perf_counter()-t:.1f}s  lower_bound={lb:.1f}  "
-    f"(vars={nvar} cons={ncon})")
-
-log("")
-for name, obj in [("prune+dp", prune_dp), ("merged", merged)]:
-    gap = 100 * (obj - lb) / lb
-    log(f"=== 3D {name:<9} gap = {gap:+.2f}%  (obj {obj:.1f} vs LP lower bound "
-        f"{lb:.1f})  <=10%: {abs(gap)<=10}  <=5%: {abs(gap)<=5} ===")
diff --git a/examples/_bench_anno.py b/examples/_bench_anno.py
deleted file mode 100644
index 45e546ff..00000000
--- a/examples/_bench_anno.py
+++ /dev/null
@@ -1,116 +0,0 @@
-"""prune+dp+annotation (the full joint config) vs prune+dp alone, compared to a
-known optimum/LP lower bound. Lite build + optional TP-plan annotation + approx.
-Env: MODEL, MESH, SEQLEN, LP_LB."""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "70b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-LP_LB = float(os.environ.get("LP_LB", "0"))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
-                                max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),)
-ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),)
-
-
-def annotate_tp_plan(autop):
-    for proj in ["wq", "wk", "wv"]:
-        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL)
-    autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL)
-    for proj in ["w1", "w3"]:
-        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL)
-    autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL)
-
-
-def constrain(autop):
-    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-    out = (Shard(0), Shard(2)) if ndim == 2 else x
-    autop.add_parameter_memory_constraint(low=None, high=None)
-    autop.add_input_constraints([x])
-    autop.add_output_constraints([out])
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### anno MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True)
-
-
-def gap(o):
-    return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan")
-
-
-# prune+dp (no annotation)
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
-autop.__enter__()
-constrain(autop)
-build_s = time.perf_counter() - t
-opt = autop.sharding_optimizer
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-dp_s = time.perf_counter() - t
-obj_dp = opt.profile["approximate"]["objective"]
-print(f"[dp]     build={build_s:.1f}s approx={dp_s:.1f}s obj={obj_dp:.1f} gap={gap(obj_dp):+.2f}%", flush=True)
-
-# + annotation
-t = time.perf_counter()
-annotate_tp_plan(autop)
-prop = autop.propagate_annotations(verbose=False, method="fix")
-prop_s = time.perf_counter() - t
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-ann_s = time.perf_counter() - t
-obj_ann = opt.profile["approximate"]["objective"]
-print(f"[dp+anno] build={build_s:.1f}s propagate={prop_s:.1f}s approx={ann_s:.1f}s "
-      f"total={build_s+prop_s+ann_s:.1f}s obj={obj_ann:.1f} gap={gap(obj_ann):+.2f}% "
-      f"(pinned {prop.nodes_determined} nodes)", flush=True)
-print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} dp_gap={gap(obj_dp):+.2f}% "
-      f"dp+anno_gap={gap(obj_ann):+.2f}% dp+anno_total={build_s+prop_s+ann_s:.1f}s", flush=True)
diff --git a/examples/_bench_approx.py b/examples/_bench_approx.py
deleted file mode 100644
index 272c47aa..00000000
--- a/examples/_bench_approx.py
+++ /dev/null
@@ -1,166 +0,0 @@
-"""Benchmark approximate solver vs ILP: objective + solve time.
-
-Setting: LLaMA3 (1b default) on a 2D (dp, tp) mesh with vocab parallelism and
-the canonical example_llama3 constraints. Both solvers run on the SAME built
-optimizer: approx first (it only fills varValues/objective via an idempotent
-_set_objective), then a fresh CBC solve for the ILP. This avoids building the
-(expensive) strategy graph twice.
-
-Env knobs: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN,
-REPEATED (1|0), RUN_ILP (1|0), ILP_TIMEOUT (seconds, 0=unlimited).
-"""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import pulp
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-_alog = logging.getLogger("autoparallel.approximate_sharding")
-_alog.setLevel(logging.INFO)
-_alog.addHandler(logging.StreamHandler())
-
-
-def log(msg):
-    print(msg, flush=True)
-
-
-_PATCHES = [
-    patch("torch.cuda.device_count", lambda: 8),
-    patch("torch.cuda.get_device_name", lambda *a, **k: "H100"),
-    patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)),
-    patch(
-        "torch.cuda.get_device_properties",
-        lambda *a, **k: type(
-            "P", (), {"major": 9, "minor": 0, "name": "H100",
-                      "total_memory": 80 * 1024**3, "multi_processor_count": 132}
-        )(),
-    ),
-]
-for p in _PATCHES:
-    p.start()
-
-MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b")
-N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
-SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4)))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
-REPEATED = os.environ.get("REPEATED", "1") == "1"
-RUN_ILP = os.environ.get("RUN_ILP", "1") == "1"
-LP_BOUND = os.environ.get("LP_BOUND", "1") == "1"
-ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "1200"))
-
-world_size = 1
-for d in MESH_SHAPE:
-    world_size *= d
-
-_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp"),
-          4: ("dp", "cp", "tp", "ep")}
-mesh_names = _NAMES[len(MESH_SHAPE)]
-fake_store = FakeStore()
-torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size)
-mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda", MESH_SHAPE, mesh_dim_names=mesh_names
-)
-
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-seqlen = SEQLEN
-
-
-def model_fn():
-    if MODEL_TYPE == "1b":
-        args = TransformerModelArgs(
-            dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
-            ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    elif MODEL_TYPE == "8b":
-        args = TransformerModelArgs(
-            dim=4096, n_layers=32, n_heads=32, n_kv_heads=8,
-            ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    else:
-        raise ValueError(MODEL_TYPE)
-    if N_LAYERS:
-        args.n_layers = N_LAYERS
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-
-log(f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} "
-    f"world={world_size} seqlen={seqlen} repeated_subgraphs={REPEATED} "
-    f"ilp_timeout={ILP_TIMEOUT}")
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=REPEATED)
-autop.__enter__()
-ndim = mesh.ndim
-x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1)
-# vocab-parallel output only defined for 2D (matches example_llama3); otherwise
-# constrain the output like the input.
-out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x_sharding])
-autop.add_output_constraints([out_sharding])
-opt = autop.sharding_optimizer
-log(f"[build] optimizer ready in {time.perf_counter() - t:.2f}s  "
-    f"vars={len(opt.pulp_variables)} constraints={len(opt.prob.constraints)} "
-    f"nodes={len(opt.nodes)}")
-
-# ---- APPROX ----
-t = time.perf_counter()
-approx = ApproximateShardingSolver(opt)
-approx.get_solution(verbose=True)
-ap_t = time.perf_counter() - t
-ap_obj = pulp.value(opt.prob.objective)
-prof = opt.profile.get("approximate", {})
-log(f"\n[APPROX] objective={ap_obj:.2f}  solve_time={ap_t:.3f}s")
-log(f"         groups={prof.get('groups')} sweeps={prof.get('sweeps')} "
-    f"build={prof.get('build_s'):.3f}s search={prof.get('solve_s'):.3f}s "
-    f"writeback={ap_t - prof.get('build_s', 0) - prof.get('solve_s', 0):.3f}s")
-
-# ---- LP relaxation lower bound (certified suboptimality upper bound) ----
-if LP_BOUND:
-    lb_res = opt.get_lower_bound(verbose=False)
-    lb = lb_res.objective
-    if lb and lb > 0:
-        cert = (ap_obj - lb) / lb
-        log(f"\n[LP-bound] lower_bound={lb:.2f}  solve={lb_res.solve_s:.2f}s  "
-            f"=> approx within {cert*100:.2f}% of optimum (certified upper bound)")
-
-# ---- ILP (fresh CBC solve on the same problem) ----
-if RUN_ILP:
-    opt._set_objective()  # idempotent: objective already populated by approx
-    kw = {"msg": True}
-    if ILP_TIMEOUT > 0:
-        kw["timeLimit"] = ILP_TIMEOUT
-    log(f"\n[ILP] solving with CBC (timeLimit={ILP_TIMEOUT or 'none'})...")
-    t = time.perf_counter()
-    opt.prob.solve(pulp.PULP_CBC_CMD(**kw))
-    ilp_t = time.perf_counter() - t
-    ilp_obj = pulp.value(opt.prob.objective)
-    status = pulp.LpStatus[opt.prob.status]
-    log(f"[ILP]    objective={ilp_obj:.2f}  solve_time={ilp_t:.3f}s  status={status}")
-
-    gap = (ap_obj - ilp_obj) / ilp_obj
-    log(f"\n=== objective gap = {gap*100:+.2f}%   solve speedup = {ilp_t/ap_t:.1f}x ===")
-    log(f"=== within 20% ? {abs(gap) <= 0.20}   (ILP status: {status}) ===")
diff --git a/examples/_bench_approx_diag.py b/examples/_bench_approx_diag.py
deleted file mode 100644
index 25de4d85..00000000
--- a/examples/_bench_approx_diag.py
+++ /dev/null
@@ -1,173 +0,0 @@
-"""Diagnose the bare approx gap: is the factor graph FAITHFUL (scores the true
-optimum correctly -> solver is at fault) or UNFAITHFUL (drops cost -> model is at
-fault), and is the optimum REPRESENTABLE in the group choices (pruning)?
-
-Builds the ILP, solves it exactly with CBC, then checks whether the approx's own
-machinery (total_objective + factor graph) reproduces the CBC optimum, and where
-the approx's own solution differs. Env: MODEL, MESH, SEQLEN."""
-import logging
-import os
-import time
-from collections import defaultdict
-from unittest.mock import patch
-
-import pulp
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "1b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
-                                max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-def constrain(autop):
-    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-    out = (Shard(0), Shard(2)) if ndim == 2 else x
-    autop.add_parameter_memory_constraint(low=None, high=None)
-    autop.add_input_constraints([x])
-    autop.add_output_constraints([out])
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### diag MODEL={MODEL} mesh={MESH_SHAPE}{names} ###", flush=True)
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
-autop.__enter__()
-constrain(autop)
-opt = autop.sharding_optimizer
-print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
-
-opt._set_objective()
-opt._apply_memory_constraint()
-t = time.perf_counter()
-opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"]))
-obj_cbc = pulp.value(opt.prob.objective)
-print(f"[cbc] solve={time.perf_counter()-t:.1f}s obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]}", flush=True)
-
-# CBC per-(root)node chosen out_idx
-cbc_out = {}
-for key, var in opt.pulp_variables.items():
-    v = var.varValue
-    if v is not None and v > 0.5:
-        cbc_out[key[0]] = key[2]
-
-approx = ApproximateShardingSolver(opt)
-approx._build_problem()
-approx._build_factors()
-
-# (A) FAITHFULNESS: exact objective of the CBC solution via the approx machinery.
-approx.cur_out = dict(cbc_out)
-e_cbc_total = approx.total_objective()
-print(f"[faithful] approx.total_objective(CBC soln) = {e_cbc_total:.1f}  "
-      f"(CBC obj {obj_cbc:.1f}; match={abs(e_cbc_total-obj_cbc)<1.0})", flush=True)
-
-# (B) REPRESENTABILITY: can the group choices express the CBC solution?
-cbc_full = dict(cbc_out)
-for copy_idx, root_idx in opt.cluster_links.items():
-    if root_idx in cbc_out:
-        cbc_full[copy_idx] = cbc_out[root_idx]
-unrep = []
-cbc_group_choice = {}
-for gid, g in enumerate(approx.groups):
-    found = None
-    for ci, choice in enumerate(g.choices):
-        if all(cbc_full.get(m) == o for m, o in choice.items()):
-            found = ci
-            break
-    if found is None:
-        unrep.append(gid)
-    else:
-        cbc_group_choice[gid] = found
-print(f"[representable] groups={len(approx.groups)} "
-      f"with_no_matching_choice={len(unrep)}", flush=True)
-
-# (C) factor-graph energy of the CBC solution (if representable)
-if not unrep:
-    for gid, ci in cbc_group_choice.items():
-        approx._set_group(gid, ci)
-    fge = approx._fast_total_energy()
-    print(f"[fg-energy] _fast_total_energy(CBC soln) = {fge:.1f} "
-          f"(match CBC {abs(fge-obj_cbc)<1.0})", flush=True)
-
-# (D) run the normal approx, localize where it differs from CBC
-approx2 = ApproximateShardingSolver(opt)
-approx2.get_solution(verbose=False)
-obj_approx = opt.profile["approximate"]["objective"]
-ax_out = dict(approx2.cur_out)
-print(f"[approx] obj={obj_approx:.1f} gap={100*(obj_approx-obj_cbc)/obj_cbc:+.2f}%", flush=True)
-
-# per-node exact cost under each assignment (cost_bearing nodes), to localize gap
-def node_cost(solver, out_map, v):
-    o = out_map[v]
-    node = opt.nodes[v]
-    strat = opt.strats[node].strategies[o]
-    prod = solver._arg_prod.get(v, {})
-    c = 0.0
-    for argi in range(len(strat.redistribute_cost)):
-        p = prod.get(argi)
-        inp = out_map[p] if (p is not None and p in out_map) else 0
-        key = (v, argi, o, inp)
-        dv = opt.decision_vars.get(key)
-        if dv is None:
-            return None
-        c += dv.cost
-    return solver.node_mult[v] * c
-
-diffs = []
-for v in approx2.cost_bearing:
-    if cbc_out.get(v) != ax_out.get(v):
-        c_cbc = node_cost(approx2, cbc_out, v)
-        c_ax = node_cost(approx2, ax_out, v)
-        if c_cbc is not None and c_ax is not None:
-            diffs.append((c_ax - c_cbc, v, opt.nodes[v].name, cbc_out.get(v), ax_out.get(v)))
-diffs.sort(reverse=True)
-print(f"[localize] {len(diffs)} cost-bearing nodes differ; top contributors (approx-cbc):", flush=True)
-for d, v, name, oc, oa in diffs[:15]:
-    print(f"    +{d:10.1f}  node={name[:40]:40s} cbc_out={oc} approx_out={oa}", flush=True)
-tot = sum(d for d, *_ in diffs)
-print(f"[localize] total node-cost diff over differing nodes = {tot:.1f} "
-      f"(gap = {obj_approx-obj_cbc:.1f})", flush=True)
diff --git a/examples/_bench_approx_ils.py b/examples/_bench_approx_ils.py
deleted file mode 100644
index d6e1b437..00000000
--- a/examples/_bench_approx_ils.py
+++ /dev/null
@@ -1,136 +0,0 @@
-"""Diagnose whether the approx solver's objective is stuck in a local-optimum
-basin that a stronger search escapes. Build once, run the stock BP+localsearch,
-then run iterated local search (perturb a random set of groups, re-optimize,
-keep best) for a time budget. If ILS beats the stock objective meaningfully, the
-gap is a move-set/init weakness (and the LP bound is ~reachable); if not, 607260
-is robust. Env: MODEL, MESH, SEQLEN, LP_LB, ILS_S."""
-import logging
-import os
-import random
-import time
-from unittest.mock import patch
-
-import numpy as np
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "70b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-LP_LB = float(os.environ.get("LP_LB", "0"))
-ILS_S = float(os.environ.get("ILS_S", "180"))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
-                                max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### ILS MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ils_s={ILS_S} ###", flush=True)
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
-autop.__enter__()
-x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out = (Shard(0), Shard(2)) if ndim == 2 else x
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x])
-autop.add_output_constraints([out])
-opt = autop.sharding_optimizer
-print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
-
-
-def gap(o):
-    return 100 * (o - LP_LB) / LP_LB if LP_LB else float("nan")
-
-
-s = ApproximateShardingSolver(opt)
-s._build_problem()
-s._build_factors()
-G = len(s.groups)
-domains = [g.domain for g in s.groups]
-multi = [d for d in domains if d > 1]
-edges = len(s.C)
-print(f"[graph] groups={G} multi_choice_groups={len(multi)} "
-      f"max_domain={max(domains)} sum_domain={sum(domains)} pair_edges={edges}", flush=True)
-
-# Stock solve (BP + local search), mirrors _solve's BP candidate.
-deadline = time.perf_counter() + 1e9
-s._belief_propagation()
-s._memory_repair()
-s._coordinate_descent(deadline)
-s._star_block_search(deadline)
-stock = s._fast_total_energy()
-best = stock
-best_snap = [g.current for g in s.groups]
-print(f"[stock] bp+cd+star energy={stock:.1f} gap={gap(stock):+.2f}%", flush=True)
-
-# Iterated local search: perturb k random multi-choice groups, re-optimize, keep best.
-rng = random.Random(0)
-multi_gids = [g for g in range(G) if s.groups[g].domain > 1]
-t0 = time.perf_counter()
-iters = 0
-accepts = 0
-while time.perf_counter() - t0 < ILS_S:
-    iters += 1
-    # restore best, then kick
-    for gid, ci in enumerate(best_snap):
-        s._set_group(gid, ci)
-    k = rng.randint(1, max(2, len(multi_gids) // 10))
-    for gid in rng.sample(multi_gids, min(k, len(multi_gids))):
-        s._set_group(gid, rng.randrange(s.groups[gid].domain))
-    s._memory_repair()
-    s._coordinate_descent(deadline)
-    s._star_block_search(deadline)
-    e = s._fast_total_energy()
-    if e < best - 1e-6:
-        best = e
-        best_snap = [g.current for g in s.groups]
-        accepts += 1
-        print(f"[ils] iter={iters} NEW BEST energy={best:.1f} gap={gap(best):+.2f}% "
-              f"(k={k})", flush=True)
-
-for gid, ci in enumerate(best_snap):
-    s._set_group(gid, ci)
-exact = s._write_back()
-print(f"[ILS done] iters={iters} accepts={accepts} stock={stock:.1f} "
-      f"best={best:.1f} exact_obj={exact:.1f} gap={gap(exact):+.2f}% "
-      f"(improvement vs stock = {100*(stock-best)/stock:.2f}%)", flush=True)
diff --git a/examples/_bench_approx_sweep.py b/examples/_bench_approx_sweep.py
deleted file mode 100644
index 3d73a070..00000000
--- a/examples/_bench_approx_sweep.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""Build one model (lite) once, then run ApproximateShardingSolver under several
-hyperparameter configs to see whether the objective gap (vs a known LP lower
-bound) is closable by tuning (candidate pruning / BP iters / time / local search)
-or is structural. Env: MODEL, MESH, SEQLEN, LP_LB (reference lower bound)."""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "70b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-LP_LB = float(os.environ.get("LP_LB", "0"))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(
-        rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### approx sweep MODEL={MODEL} mesh={MESH_SHAPE}{names} LP_lb={LP_LB} ###", flush=True)
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
-autop.__enter__()
-x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out = (Shard(0), Shard(2)) if ndim == 2 else x
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x])
-autop.add_output_constraints([out])
-opt = autop.sharding_optimizer
-print(f"[build] lite_build={time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
-
-CONFIGS = [
-    ("default", dict()),
-    ("cand=256", dict(candidate_limit=256)),
-    ("cand=None", dict(candidate_limit=None)),
-    ("bp=100", dict(bp_iters=100)),
-    ("sweeps=200,star=20,t=600", dict(max_sweeps=200, star_passes=20, max_time_s=600)),
-    ("star_children=64,domain=4096", dict(max_star_children=64, group_domain_limit=4096)),
-    ("ALL generous", dict(candidate_limit=None, bp_iters=100, max_sweeps=200,
-                          star_passes=20, max_time_s=900, max_star_children=64,
-                          group_domain_limit=4096)),
-]
-
-best = None
-for name, cfg in CONFIGS:
-    t = time.perf_counter()
-    solver = ApproximateShardingSolver(opt, **cfg)
-    solver.get_solution(verbose=False)
-    dt = time.perf_counter() - t
-    ap = opt.profile["approximate"]
-    obj = ap["objective"]
-    gap = 100 * (obj - LP_LB) / LP_LB if LP_LB else float("nan")
-    winner = "bp" if ap["bp_energy"] <= ap["greedy_energy"] else "greedy"
-    print(f"[cfg] {name:30s} obj={obj:.1f} gap={gap:+.2f}% "
-          f"bp={ap['bp_energy']:.1f} greedy={ap['greedy_energy']:.1f} win={winner} "
-          f"t={dt:.1f}s", flush=True)
-    if best is None or obj < best[1]:
-        best = (name, obj)
-
-print(f"[BEST] {best[0]} obj={best[1]:.1f} "
-      f"gap={100*(best[1]-LP_LB)/LP_LB:+.2f}%" if LP_LB else f"[BEST] {best[0]} obj={best[1]:.1f}",
-      flush=True)
diff --git a/examples/_bench_build_profile.py b/examples/_bench_build_profile.py
deleted file mode 100644
index 03b6a2c9..00000000
--- a/examples/_bench_build_profile.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Dump the lite-build phase breakdown (tracing vs strategy enumeration vs
-decision-var cost estimation) for LLaMA3-1B on a 3D mesh, to see where the
-~615s build time goes. Env: MESH, SEQLEN."""
-import json
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-if os.environ.get("DEBUG_CLUSTER") == "1":
-    h = logging.StreamHandler()
-    h.setLevel(logging.DEBUG)
-    for nm in ("autoparallel.graph_passes.graph_clustering", "autoparallel.optimize_sharding"):
-        lg = logging.getLogger(nm)
-        lg.setLevel(logging.DEBUG)
-        lg.addHandler(h)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "1b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-
-
-def model_fn():
-    args = TransformerModelArgs(
-        rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"=== build profile: MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ===", flush=True)
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
-autop.__enter__()
-enter_s = time.perf_counter() - t
-opt = autop.sharding_optimizer
-tm = opt.profile["timings"]
-init = tm.get("init_total_s", 0.0)
-tracing = enter_s - init  # __enter__ = tracing + ShardingOptimizer construction
-
-print(json.dumps({
-    "enter_total_s": round(enter_s, 1),
-    "tracing_s (enter - optimizer_init)": round(tracing, 1),
-    "optimizer_init_total_s": round(init, 1),
-    "  strategy_enumeration_s": round(tm.get("strategy_enumeration_s", 0), 1),
-    "  decision_var_build_s": round(tm.get("decision_var_build_s", 0), 1),
-    "    compute_cost_estimation_s": round(tm.get("compute_cost_estimation_s", 0), 1),
-    "    edge_cost_estimation_s": round(tm.get("edge_cost_estimation_s", 0), 1),
-    "    pulp_var_creation_s (0 in lite)": round(tm.get("pulp_var_creation_s", 0), 1),
-    "  validation_s": round(tm.get("validation_s", 0), 1),
-    "decision_vars": len(opt.decision_vars),
-    "graph_nodes": opt.profile["model"]["graph_nodes"],
-    "strategy_options": opt.profile["strategies"]["strategy_options"],
-    "option_tuples (edges)": opt.profile["strategies"]["option_tuples"],
-}, indent=2), flush=True)
diff --git a/examples/_bench_build_verify.py b/examples/_bench_build_verify.py
deleted file mode 100644
index 08fea734..00000000
--- a/examples/_bench_build_verify.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""A/B verify that the fast build (AP_FAST_BUILD=1) produces byte-identical
-decision_vars + approx objective as the baseline (AP_FAST_BUILD=0), and report
-build time. Run the same MESH/MODEL with both env values and diff the dv_hash.
-Env: MESH, SEQLEN, MODEL (tiny|1b)."""
-import hashlib
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MODEL = os.environ.get("MODEL", "tiny")
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "4,2").split(","))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128 if MODEL == "tiny" else 128256
-batch_size = 2 * mesh.shape[0]
-
-
-def model_fn():
-    if MODEL == "tiny":
-        args = TransformerModelArgs(dim=64, n_layers=2, n_heads=4, n_kv_heads=2,
-                                    vocab_size=vocab_size, multiple_of=32,
-                                    rope_theta=500000, max_seq_len=SEQLEN)
-    else:
-        args = TransformerModelArgs(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
-                                    ffn_dim_multiplier=1.5, multiple_of=256,
-                                    rope_theta=500000, vocab_size=vocab_size,
-                                    max_seq_len=SEQLEN)
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out = (Shard(0), Shard(2)) if ndim == 2 else x
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
-autop.__enter__()
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x])
-autop.add_output_constraints([out])
-build_s = time.perf_counter() - t
-opt = autop.sharding_optimizer
-
-# Canonical, exact dump of every decision var's costs.
-items = []
-for key in sorted(opt.decision_vars.keys()):
-    dv = opt.decision_vars[key]
-    items.append((key, repr(dv.cost), repr(dv.comm_cost), repr(dv.compute_cost),
-                  repr(dv.sharding_transition_cost)))
-dv_hash = hashlib.sha256(repr(items).encode()).hexdigest()
-
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-approx_s = time.perf_counter() - t
-obj = opt.profile["approximate"]["objective"]
-
-print(f"AP_FAST_BUILD={os.environ.get('AP_FAST_BUILD', '1')}  MODEL={MODEL} "
-      f"MESH={MESH_SHAPE}  build={build_s:.2f}s  approx={approx_s:.2f}s  "
-      f"n_dv={len(opt.decision_vars)}  dv_hash={dv_hash[:32]}  "
-      f"approx_obj={obj!r}", flush=True)
diff --git a/examples/_bench_dp_alone.py b/examples/_bench_dp_alone.py
deleted file mode 100644
index 4b67c3a1..00000000
--- a/examples/_bench_dp_alone.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""Minimal approx-solver timing, for the 'dp alone' (approx WITHOUT prune)
-baseline. Run it with PYTHONPATH pointing at the dp_solver checkout to get the
-unpruned numbers, and at the merge checkout to cross-check prune+dp.
-
-Reports lite-build time, approx solve time, decision-var count and objective for
-LLaMA3-1B with the canonical constraints. Env: MESH, SEQLEN, N_LAYERS.
-"""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-import autoparallel
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-
-
-def model_fn():
-    args = TransformerModelArgs(
-        dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5,
-        multiple_of=256, rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN)
-    if N_LAYERS:
-        args.n_layers = N_LAYERS
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out = (Shard(0), Shard(2)) if ndim == 2 else x
-
-print(f"autoparallel = {autoparallel.__file__}", flush=True)
-print(f"=== dp-alone (approx) LLaMA3-1B mesh={MESH_SHAPE}{names} seqlen={SEQLEN} "
-      f"layers={N_LAYERS or 16} ===", flush=True)
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
-autop.__enter__()
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x])
-autop.add_output_constraints([out])
-t_build = time.perf_counter() - t
-opt = autop.sharding_optimizer
-
-# With MERGED=1, add the propagated TP plan before solving (full joint solver).
-t_prop = 0.0
-label = "dp-alone"
-if os.environ.get("MERGED") == "1":
-    label = "merged"
-    cp = (None,) * (ndim - 1) + (Shard(0),)
-    rp = (None,) * (ndim - 1) + (Shard(1),)
-    for proj in ["wq", "wk", "wv"]:
-        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", cp)
-    autop.annotate_parameter("layers.*.attention.wo.weight", rp)
-    for proj in ["w1", "w3"]:
-        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", cp)
-    autop.annotate_parameter("layers.*.feed_forward.w2.weight", rp)
-    t = time.perf_counter()
-    autop.propagate_annotations(verbose=False, method="fix")
-    t_prop = time.perf_counter() - t
-
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-t_solve = time.perf_counter() - t
-obj = opt.profile["approximate"]["objective"]
-
-print(f"[{label}] build={t_build:.2f}s  propagate={t_prop:.2f}s  "
-      f"approx_solve={t_solve:.2f}s  total={t_build + t_prop + t_solve:.2f}s  "
-      f"obj={obj:.1f}  decision_vars={len(opt.decision_vars)}", flush=True)
diff --git a/examples/_bench_lp_3d.py b/examples/_bench_lp_3d.py
deleted file mode 100644
index 5b08840b..00000000
--- a/examples/_bench_lp_3d.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""Benchmark LP-relaxation solve time for LLaMA3 on a 3D mesh."""
-import logging
-import os
-import time
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.WARNING)
-
-MODEL_TYPE = os.environ.get("MODEL_TYPE", "8b")
-N_LAYERS = int(os.environ.get("N_LAYERS", "0"))  # 0 => use default for model
-SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4)))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-MESH_NAMES = ("dp", "cp", "tp")
-
-world_size = 1
-for d in MESH_SHAPE:
-    world_size *= d
-
-fake_store = FakeStore()
-torch.distributed.init_process_group("fake", store=fake_store, rank=0, world_size=world_size)
-
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=MESH_NAMES)
-
-batch_size = 2 * mesh.shape[0]
-seqlen = SEQLEN
-vocab_size = 128256
-device = torch.device("cuda")
-
-
-def model_fn():
-    if MODEL_TYPE == "1b":
-        args = TransformerModelArgs(
-            dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
-            ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    elif MODEL_TYPE == "8b":
-        args = TransformerModelArgs(
-            dim=4096, n_layers=32, n_heads=32, n_kv_heads=8,
-            ffn_dim_multiplier=1.3, multiple_of=1024, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    elif MODEL_TYPE == "70b":
-        args = TransformerModelArgs(
-            dim=8192, n_layers=80, n_heads=64, n_kv_heads=8,
-            ffn_dim_multiplier=1.3, multiple_of=4096, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    else:
-        raise ValueError(MODEL_TYPE)
-    if N_LAYERS:
-        args.n_layers = N_LAYERS
-    return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, seqlen), device=device)
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-
-with torch.device("meta"):
-    model = model_fn()
-
-mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-
-print(f"=== model={MODEL_TYPE} n_layers={model.model_args.n_layers} "
-      f"mesh={MESH_SHAPE}{MESH_NAMES} world_size={world_size} ===")
-
-print("[build] entering AutoParallel (graph export + strategy enumeration)...", flush=True)
-t_build = time.perf_counter()
-with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True) as autop:
-    print(f"[build] AutoParallel ready in {time.perf_counter() - t_build:.2f} s", flush=True)
-    autop.add_parameter_memory_constraint(low=None, high=None)
-    x_sharding = (Shard(0),) + (Replicate(),) * (mesh.ndim - 1)
-    autop.add_input_constraints([x_sharding])
-    autop.add_output_constraints([x_sharding])
-    print(f"[build+constraints] {time.perf_counter() - t_build:.2f} s")
-
-    opt = autop.sharding_optimizer
-    print(f"[problem] unique_vars={len(opt.pulp_variables)} "
-          f"constraints={len(opt.prob.constraints)}", flush=True)
-
-    mode = os.environ.get("SOLVE_MODE", "lp")  # lp | ilp | both
-
-    if mode in ("lp", "both"):
-        res = opt.get_lower_bound(verbose=False)
-        print(f"[LP relaxation] status={res.status} objective={res.objective:.4f}")
-        print(f"[LP relaxation] solve_s={res.solve_s:.3f}  total_s={res.total_s:.3f}", flush=True)
-
-    if mode in ("ilp", "both"):
-        print("[ILP] solving (this may take a long time)...", flush=True)
-        t_ilp = time.perf_counter()
-        opt.get_solution(verbose=True)
-        import pulp
-        obj = pulp.value(opt.prob.objective)
-        print(f"[ILP] status={pulp.LpStatus[opt.prob.status]} objective={obj}")
-        print(f"[ILP] solve+extract_s={time.perf_counter() - t_ilp:.3f}", flush=True)
diff --git a/examples/_bench_lp_integrality.py b/examples/_bench_lp_integrality.py
deleted file mode 100644
index 1c95b7e1..00000000
--- a/examples/_bench_lp_integrality.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""Re-solve the 70B LP relaxation and report how integral the optimum is: count
-fractional variables in the HiGHS solution. If ~all variables are 0/1, the LP
-optimum is reachable by integers (so an approx gap is a real solver failure); if
-many are fractional, the LP bound is loose (and the approx may be near-optimal).
-Also reports the objective with the memory constraint dropped, to test whether
-the memory budget is the fractionality source. Env: MODEL, MESH, SEQLEN."""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import numpy as np
-import pulp
-import scipy.sparse as sp
-import torch
-from scipy.optimize import linprog
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "70b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-DROP_MEM = os.environ.get("DROP_MEM", "0") == "1"
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
-                                max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### LP integrality MODEL={MODEL} mesh={MESH_SHAPE}{names} drop_mem={DROP_MEM} ###", flush=True)
-
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
-autop.__enter__()
-x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out = (Shard(0), Shard(2)) if ndim == 2 else x
-autop.add_parameter_memory_constraint(low=None, high=None)
-autop.add_input_constraints([x])
-autop.add_output_constraints([out])
-opt = autop.sharding_optimizer
-print(f"[build] full_build={time.perf_counter()-t:.1f}s", flush=True)
-
-opt._set_objective()
-if not DROP_MEM:
-    opt._apply_memory_constraint()
-variables = opt.prob.variables()
-vidx = {id(v): i for i, v in enumerate(variables)}
-n = len(variables)
-c = np.zeros(n)
-for key, dv in opt.decision_vars.items():
-    mult = 1 + len(opt._root_to_copies.get(key[0], ()))
-    c[vidx[id(dv.var)]] += dv.cost * mult
-re = ru = 0
-reqr, reqc, reqd, beq = [], [], [], []
-rubr, rubc, rubd, bub = [], [], [], []
-for con in opt.prob.constraints.values():
-    rhs = -con.constant
-    if con.sense == pulp.LpConstraintEQ:
-        for v, co in con.items():
-            reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co)
-        beq.append(rhs); re += 1
-    else:
-        sgn = 1.0 if con.sense == pulp.LpConstraintLE else -1.0
-        for v, co in con.items():
-            rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(sgn * co)
-        bub.append(sgn * rhs); ru += 1
-A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None
-A_ub = sp.csr_matrix((rubd, (rubr, rubc)), shape=(ru, n)) if ru else None
-t = time.perf_counter()
-res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None),
-              bounds=(0, 1), method="highs-ds", options={"disp": True})
-print(f"[lp] solve={time.perf_counter()-t:.1f}s status={res.message}", flush=True)
-xv = res.x
-freq = np.abs(xv - np.round(xv))
-nfrac = int((freq > 1e-6).sum())
-nfrac4 = int((freq > 1e-4).sum())
-# weight fractionality by objective contribution to see if it matters
-frac_obj = float(np.abs(c * freq).sum())
-print(f"[RESULT] MODEL={MODEL} drop_mem={DROP_MEM} obj={res.fun:.1f} "
-      f"vars={n} fractional(>1e-6)={nfrac} ({100*nfrac/n:.4f}%) "
-      f"fractional(>1e-4)={nfrac4} frac_obj_weight={frac_obj:.1f}", flush=True)
diff --git a/examples/_bench_mem_lagrangian.py b/examples/_bench_mem_lagrangian.py
deleted file mode 100644
index 6166a552..00000000
--- a/examples/_bench_mem_lagrangian.py
+++ /dev/null
@@ -1,237 +0,0 @@
-"""Compare the Lagrangian memory-constrained approximate solve against the LP
-(relaxation) optimum across a sweep of parameter-memory budgets.
-
-The optimizer (the expensive build) is constructed ONCE; each budget only
-re-runs the cheap solves. For every budget factor `high` (with low=0):
-  - LP: set the memory constraint and solve the (integral) relaxation -> the
-    exact constrained optimum (gold standard).
-  - Lagrangian approx: fold lambda * ratio into the unaries and bisect lambda
-    until the achieved memory lands in the same [low, high] budget.
-The two solvers are pinned to the SAME numeric budget (read back from the LP's
-constraint rows) so the comparison is apples-to-apples.
-
-Env: MODEL_TYPE (1b|8b), MESH ("8,8"), N_LAYERS (0=default), SEQLEN,
-HIGH_FACTORS (comma list, default sweep), BP_ITERS.
-"""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import pulp
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-
-
-def log(msg):
-    print(msg, flush=True)
-
-
-_PATCHES = [
-    patch("torch.cuda.device_count", lambda: 8),
-    patch("torch.cuda.get_device_name", lambda *a, **k: "H100"),
-    patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)),
-    patch(
-        "torch.cuda.get_device_properties",
-        lambda *a, **k: type(
-            "P",
-            (),
-            {
-                "major": 9,
-                "minor": 0,
-                "name": "H100",
-                "total_memory": 80 * 1024**3,
-                "multi_processor_count": 132,
-            },
-        )(),
-    ),
-]
-for p in _PATCHES:
-    p.start()
-
-MODEL_TYPE = os.environ.get("MODEL_TYPE", "1b")
-N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
-SEQLEN = int(os.environ.get("SEQLEN", str(2048 * 4)))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
-BP_ITERS = int(os.environ.get("BP_ITERS", "120"))
-HIGH_FACTORS = [
-    float(x)
-    for x in os.environ.get(
-        "HIGH_FACTORS", "0.0156,0.03125,0.0625,0.125,0.25,0.5,1.0"
-    ).split(",")
-]
-# On budgets where the LP relaxation is fractional (its optimum is an
-# unachievable lower bound) also solve the true ILP to report the achievable gap.
-RUN_ILP = os.environ.get("RUN_ILP", "0") == "1"
-ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "300"))
-
-world_size = 1
-for d in MESH_SHAPE:
-    world_size *= d
-
-_NAMES = {1: ("dp",), 2: ("dp", "tp"), 3: ("dp", "cp", "tp")}
-mesh_names = _NAMES[len(MESH_SHAPE)]
-fake_store = FakeStore()
-torch.distributed.init_process_group(
-    "fake", store=fake_store, rank=0, world_size=world_size
-)
-mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda", MESH_SHAPE, mesh_dim_names=mesh_names
-)
-
-vocab_size = 128256
-batch_size = int(os.environ.get("BATCH", str(2 * mesh.shape[0])))
-seqlen = SEQLEN
-
-
-def model_fn():
-    args = TransformerModelArgs(
-        dim=2048,
-        n_layers=16,
-        n_heads=32,
-        n_kv_heads=8,
-        ffn_dim_multiplier=1.5,
-        multiple_of=256,
-        rope_theta=500000,
-        vocab_size=vocab_size,
-        max_seq_len=seqlen,
-    )
-    if MODEL_TYPE == "8b":
-        args = TransformerModelArgs(
-            dim=4096,
-            n_layers=32,
-            n_heads=32,
-            n_kv_heads=8,
-            ffn_dim_multiplier=1.3,
-            multiple_of=1024,
-            rope_theta=500000,
-            vocab_size=vocab_size,
-            max_seq_len=seqlen,
-        )
-    if N_LAYERS:
-        args.n_layers = N_LAYERS
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-
-log(
-    f"model={MODEL_TYPE} layers={N_LAYERS or 'default'} mesh={MESH_SHAPE} "
-    f"world={world_size} seqlen={seqlen} bp_iters={BP_ITERS}"
-)
-
-# ---- build once ----
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True)
-autop.__enter__()
-ndim = mesh.ndim
-x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1)
-out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding
-# Build with a LOOSE budget so the approx build does not pin params to the
-# min-ratio (fully-sharded) choices; the per-budget sweep overrides the budget
-# numerically afterward. (A tight default would prune param strategies at build
-# time and freeze the achievable memory.)
-autop.add_parameter_memory_constraint(low=0.0, high=1.0)
-autop.add_input_constraints([x_sharding])
-autop.add_output_constraints([out_sharding])
-opt = autop.sharding_optimizer
-log(
-    f"[build] optimizer ready in {time.perf_counter() - t:.2f}s "
-    f"vars={len(opt.pulp_variables)} nodes={len(opt.nodes)}"
-)
-
-# build the approximate solver once (ratios / factor graph / mem unary cached)
-t = time.perf_counter()
-approx = ApproximateShardingSolver(opt, bp_iters=BP_ITERS)
-approx._build_problem()
-approx._build_factors()
-approx._build_mem_unary()
-log(
-    f"[build] approx solver ready in {time.perf_counter() - t:.2f}s "
-    f"groups={len(approx.groups)} "
-    f"params={len(approx._memory['param_idxs']) if approx._memory else 0}"
-)
-opt._set_objective()
-
-
-def lp_budget():
-    """Read back the exact [low, high] the LP applied, so approx uses the same."""
-    ch = opt.prob.constraints["memory_constraint_high"]
-    cl = opt.prob.constraints["memory_constraint_low"]
-    return -cl.constant, -ch.constant
-
-
-log("\n" + "=" * 110)
-log(
-    f"{'high_f':>8} | {'budget':>16} | {'LP obj':>12} {'frac':>7} {'LP s':>6} | "
-    f"{'approx obj':>12} {'mem':>7} {'lam':>9} {'feas':>5} {'s':>5} | "
-    f"{'gap/LP':>7} {'ILP obj':>12} {'gap/ILP':>8}"
-)
-log("-" * 110)
-
-rows = []
-for hf in HIGH_FACTORS:
-    opt._memory_constraint = (0.0, hf)
-    t = time.perf_counter()
-    lp = opt.solve_lp_relaxation(verbose=False, extract=False)
-    lp_s = time.perf_counter() - t
-    lp_obj = lp["objective"]
-    frac = f"{lp['n_fractional']}/{lp['n_vars']}"
-    blow, bhigh = lp_budget()
-
-    approx._memory["budget_low"] = blow
-    approx._memory["budget_high"] = bhigh
-    approx._memory["tight"] = abs(bhigh - blow) < 1e-9
-    t = time.perf_counter()
-    res = approx.solve_lagrangian(blow, bhigh, max_iter=24)
-    ap_s = time.perf_counter() - t
-    ap_obj = res["objective"]
-    gap = (ap_obj - lp_obj) / lp_obj * 100 if lp_obj else float("nan")
-
-    ilp_obj, gap_ilp = None, None
-    if RUN_ILP and lp["n_fractional"] > 0:
-        opt._set_objective()
-        opt._apply_memory_constraint()
-        opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, timeLimit=ILP_TIMEOUT))
-        ilp_obj = pulp.value(opt.prob.objective)
-        gap_ilp = (ap_obj - ilp_obj) / ilp_obj * 100 if ilp_obj else float("nan")
-
-    rows.append((hf, lp_obj, ap_obj, gap, res["feasible"], ilp_obj, gap_ilp))
-    log(
-        f"{hf:>8.4g} | [{blow:>6.3f},{bhigh:>7.3f}] | {lp_obj:>12.1f} {frac:>7} "
-        f"{lp_s:>5.1f}s | {ap_obj:>12.1f} {res['memory']:>7.3f} {res['lam']:>9.4g} "
-        f"{str(res['feasible']):>5} {ap_s:>4.1f}s | {gap:>+6.2f}% "
-        f"{('%.1f' % ilp_obj) if ilp_obj else '-':>12} "
-        f"{('%+.2f%%' % gap_ilp) if gap_ilp is not None else '-':>8}"
-    )
-
-log("=" * 110)
-gaps = [r[3] for r in rows if r[1]]
-feas = [r[4] for r in rows]
-if gaps:
-    log(
-        f"gap vs LP: mean={sum(gaps)/len(gaps):+.2f}% max={max(gaps):+.2f}% "
-        f"min={min(gaps):+.2f}%  feasible={sum(feas)}/{len(feas)}"
-    )
-gi = [r[6] for r in rows if r[6] is not None]
-if gi:
-    log(
-        f"gap vs ILP (fractional-LP budgets): mean={sum(gi)/len(gi):+.2f}% "
-        f"max={max(gi):+.2f}%"
-    )
diff --git a/examples/_bench_merge.py b/examples/_bench_merge.py
deleted file mode 100644
index c6249021..00000000
--- a/examples/_bench_merge.py
+++ /dev/null
@@ -1,293 +0,0 @@
-"""Joint-optimization benchmark: prune (+ annotated) + dp (approx) vs each alone.
-
-Measures, for LLaMA3-1B on a 2D or 3D mesh with the canonical example_llama3
-constraints, four optimization configurations on the SAME traced model:
-
-  prune     : full ILP build  + exact CBC solve            (== prune_search_space)
-  annotated : full ILP build  + propagate(fix) + CBC solve (== annotated_search)
-  dp        : lite build      + approx solve               (== dp_solver)
-  merged    : lite build      + propagate(fix) + approx    (this branch)
-
-Reports each config's build/solve/total time and objective, the LP-relaxation
-lower bound (an optimality certificate), and checks the acceptance criteria:
-
-  * merged objective within 10% (ideally 5%) of the ILP optimum, and
-  * merged total time < every individual optimization's total time.
-
-Env knobs: MESH ("8,8" 2D / "2,4,8" 3D), ILP_TIMEOUT (s, 0=unlimited),
-N_LAYERS (0=default 16), SEQLEN.
-"""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import pulp
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-
-
-def log(msg=""):
-    print(msg, flush=True)
-
-
-# Fake an 8-GPU H100 node so the cost model runs without real GPUs.
-_PATCHES = [
-    patch("torch.cuda.device_count", lambda: 8),
-    patch("torch.cuda.get_device_name", lambda *a, **k: "H100"),
-    patch("torch.cuda.get_device_capability", lambda *a, **k: (9, 0)),
-    patch(
-        "torch.cuda.get_device_properties",
-        lambda *a, **k: type(
-            "P", (), {"major": 9, "minor": 0, "name": "H100",
-                      "total_memory": 80 * 1024**3, "multi_processor_count": 132}
-        )(),
-    ),
-]
-for p in _PATCHES:
-    p.start()
-
-N_LAYERS = int(os.environ.get("N_LAYERS", "0"))
-SEQLEN = int(os.environ.get("SEQLEN", str(2048)))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
-ILP_TIMEOUT = float(os.environ.get("ILP_TIMEOUT", "0"))
-
-world_size = 1
-for d in MESH_SHAPE:
-    world_size *= d
-_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}
-mesh_names = _NAMES[len(MESH_SHAPE)]
-fake_store = FakeStore()
-torch.distributed.init_process_group(
-    "fake", store=fake_store, rank=0, world_size=world_size
-)
-mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda", MESH_SHAPE, mesh_dim_names=mesh_names
-)
-ndim = mesh.ndim
-
-# MODEL=1b is the real LLaMA3-1B; MODEL=small is a tractable proxy whose smaller
-# tensors yield few enough decision variables that the exact ILP/LP-bound finish
-# on a 3D mesh (where the 1B PuLP problem has ~8M variables and is impractical),
-# letting us certify the approximate solver's gap on real 3D structure.
-MODEL = os.environ.get("MODEL", "1b")
-vocab_size = 1024 if MODEL == "small" else 128256
-batch_size = 2 * mesh.shape[0]
-seqlen = SEQLEN
-
-
-def model_fn():
-    if MODEL == "small":
-        args = TransformerModelArgs(
-            dim=256, n_layers=4, n_heads=8, n_kv_heads=4,
-            multiple_of=64, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    else:
-        args = TransformerModelArgs(
-            dim=2048, n_layers=16, n_heads=32, n_kv_heads=8,
-            ffn_dim_multiplier=1.5, multiple_of=256, rope_theta=500000,
-            vocab_size=vocab_size, max_seq_len=seqlen,
-        )
-    if N_LAYERS:
-        args.n_layers = N_LAYERS
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, seqlen), device="cuda")
-
-
-# Canonical TP plan: column-parallel q/k/v/w1/w3, row-parallel wo/w2, pinning
-# only the tensor-parallel (last) mesh axis; data/cp axes left to the optimizer.
-COLUMN_PARALLEL = (None,) * (ndim - 1) + (Shard(0),)
-ROW_PARALLEL = (None,) * (ndim - 1) + (Shard(1),)
-
-
-def annotate_tp_plan(autop):
-    for proj in ["wq", "wk", "wv"]:
-        autop.annotate_parameter(f"layers.*.attention.{proj}.weight", COLUMN_PARALLEL)
-    autop.annotate_parameter("layers.*.attention.wo.weight", ROW_PARALLEL)
-    for proj in ["w1", "w3"]:
-        autop.annotate_parameter(f"layers.*.feed_forward.{proj}.weight", COLUMN_PARALLEL)
-    autop.annotate_parameter("layers.*.feed_forward.w2.weight", ROW_PARALLEL)
-
-
-def add_constraints(autop):
-    x_sharding = (Shard(0),) + (Replicate(),) * (ndim - 1)
-    out_sharding = (Shard(0), Shard(2)) if ndim == 2 else x_sharding
-    autop.add_parameter_memory_constraint(low=None, high=None)
-    autop.add_input_constraints([x_sharding])
-    autop.add_output_constraints([out_sharding])
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-
-log(f"=== LLaMA3-{MODEL}  mesh={MESH_SHAPE}{mesh_names}  world={world_size}  "
-    f"seqlen={seqlen}  vocab={vocab_size}  layers={N_LAYERS or '(default)'} ===")
-results = {}  # name -> dict(build, solve, total, obj)
-
-
-def build(build_pulp):
-    t = time.perf_counter()
-    autop = AutoParallel(
-        model_fn(), input_fn, mesh, mp, repeated_subgraphs=True,
-        solver="ilp" if build_pulp else "approx",
-    )
-    autop.__enter__()
-    add_constraints(autop)
-    return autop, time.perf_counter() - t
-
-
-# ---------- full PuLP build: prune (ILP) + annotated (ILP) + LP bound ----------
-autop_full, build_full = build(build_pulp=True)
-opt = autop_full.sharding_optimizer
-log(f"\n[full build] {build_full:.2f}s  decision_vars={len(opt.decision_vars)}  "
-    f"pulp_vars={len(opt.pulp_variables)}  constraints={len(opt.prob.constraints)}")
-
-# prune: exact ILP solve. preprocess-off is part of the prune optimization, and
-# _apply_memory_constraint installs the same budget the approx solver enforces,
-# so every config solves the identical constrained problem.
-opt._set_objective()
-opt._apply_memory_constraint()
-kw = {"msg": False, "options": ["preprocess off"]}
-if ILP_TIMEOUT > 0:
-    kw["timeLimit"] = ILP_TIMEOUT
-t = time.perf_counter()
-opt.prob.solve(pulp.PULP_CBC_CMD(**kw))
-t_ilp = time.perf_counter() - t
-obj_opt = pulp.value(opt.prob.objective)
-ilp_status = pulp.LpStatus[opt.prob.status]
-results["prune"] = dict(build=build_full, solve=t_ilp, total=build_full + t_ilp,
-                        obj=obj_opt)
-log(f"[prune    ] ILP solve {t_ilp:8.2f}s  obj={obj_opt:11.1f}  status={ilp_status}")
-
-# LP-relaxation lower bound: certifies the optimality gap without a full ILP
-# (this sharding LP is empirically integral, so the bound equals the optimum).
-lb_res = opt.get_lower_bound(verbose=False)
-lb = lb_res.objective
-log(f"[LP-bound ] solve {lb_res.solve_s:8.2f}s  lower_bound={lb:11.1f}")
-
-# annotated: propagate the TP plan, then exact ILP solve on the reduced problem.
-annotate_tp_plan(autop_full)
-t = time.perf_counter()
-prop = autop_full.propagate_annotations(verbose=False, method="fix")
-t_prop_full = time.perf_counter() - t
-opt._apply_memory_constraint()
-t = time.perf_counter()
-opt.prob.solve(pulp.PULP_CBC_CMD(**kw))
-t_ilp_ann = time.perf_counter() - t
-obj_ann = pulp.value(opt.prob.objective)
-results["annotated"] = dict(build=build_full, solve=t_prop_full + t_ilp_ann,
-                            total=build_full + t_prop_full + t_ilp_ann, obj=obj_ann)
-log(f"[annotated] propagate {t_prop_full:.2f}s + ILP {t_ilp_ann:.2f}s  "
-    f"obj={obj_ann:11.1f}  (pinned {prop.nodes_determined} nodes, "
-    f"-{100*prop.reduction:.0f}% strategies)")
-
-# Tear down before the next build: AutoParallel installs a FakeTensorMode, and
-# two entered instances can't coexist.
-autop_full.__exit__(None, None, None)
-
-# ---------- lite build: dp=prune+approx + merged=prune+approx+annotated -------
-autop_lite, build_lite = build(build_pulp=False)
-opt_l = autop_lite.sharding_optimizer
-log(f"\n[lite build] {build_lite:.2f}s  decision_vars={len(opt_l.decision_vars)}  "
-    f"pulp_vars={len(opt_l.pulp_variables)} (no PuLP problem)")
-
-# dp: approximate solve, no annotations.
-t = time.perf_counter()
-ApproximateShardingSolver(opt_l).get_solution(verbose=False)
-t_approx_dp = time.perf_counter() - t
-obj_dp = opt_l.profile["approximate"]["objective"]
-results["dp"] = dict(build=build_lite, solve=t_approx_dp, total=build_lite + t_approx_dp,
-                     obj=obj_dp)
-log(f"[dp       ] approx solve {t_approx_dp:8.2f}s  obj={obj_dp:11.1f}")
-
-# merged: propagate the TP plan, then approximate solve on the reduced problem.
-annotate_tp_plan(autop_lite)
-t = time.perf_counter()
-prop_l = autop_lite.propagate_annotations(verbose=False, method="fix")
-t_prop_lite = time.perf_counter() - t
-t = time.perf_counter()
-ApproximateShardingSolver(opt_l).get_solution(verbose=False)
-t_approx_merged = time.perf_counter() - t
-obj_merged = opt_l.profile["approximate"]["objective"]
-results["merged"] = dict(build=build_lite, solve=t_prop_lite + t_approx_merged,
-                         total=build_lite + t_prop_lite + t_approx_merged, obj=obj_merged)
-log(f"[merged   ] propagate {t_prop_lite:.2f}s + approx {t_approx_merged:.2f}s  "
-    f"obj={obj_merged:11.1f}  (pinned {prop_l.nodes_determined} nodes)")
-
-autop_lite.__exit__(None, None, None)
-
-# ---------- report ----------
-# Optimality reference: exact ILP optimum if CBC proved it, else the LP lower
-# bound (this sharding LP is empirically integral, so lb == optimum).
-optimal = obj_opt if ilp_status == "Optimal" else lb
-opt_label = "ILP optimum" if ilp_status == "Optimal" else "LP lower bound"
-
-LABELS = {
-    "prune": "prune (ILP)",
-    "annotated": "annotated (ILP)",
-    "dp": "prune+dp (approx)",
-    "merged": "prune+dp+anno",
-}
-log("\n" + "=" * 78)
-log(f"{'config':<20}{'build(s)':>10}{'solve(s)':>10}{'total(s)':>10}"
-    f"{'objective':>13}{'gap%':>9}")
-log("-" * 78)
-for name in ["prune", "annotated", "dp", "merged"]:
-    r = results[name]
-    gap = 100 * (r["obj"] - optimal) / optimal
-    log(f"{LABELS[name]:<20}{r['build']:>10.2f}{r['solve']:>10.2f}{r['total']:>10.2f}"
-        f"{r['obj']:>13.1f}{gap:>+9.2f}")
-log("=" * 78)
-log(f"optimality reference: {opt_label} = {optimal:.1f}  (ILP status={ilp_status})")
-
-# Core joint optimization is prune + dp (the approximate solver on the pruned
-# space); annotation is the optional extra speedup. Report both gaps.
-gap_core = 100 * (obj_dp - optimal) / optimal
-gap_full = 100 * (obj_merged - optimal) / optimal
-log(f"\nobjective gap vs {opt_label}:")
-log(f"  prune+dp (approx)      : {gap_core:+.2f}%   (core: prune + dp)")
-log(f"  prune+dp+annotated     : {gap_full:+.2f}%   (+ optional annotation)")
-
-# Timing: the joint solver must beat each ILP-based individual optimization.
-# (dp alone == approx WITHOUT prune is measured against the dp_solver checkout
-#  separately; prune makes the joint build/solve strictly cheaper than that.)
-log("\njoint total time (build+solve) vs each individual optimization:")
-all_faster = True
-for joint in ["dp", "merged"]:
-    tj = results[joint]["total"]
-    line_ok = True
-    for name in ["prune", "annotated"]:
-        to = results[name]["total"]
-        faster = tj < to
-        line_ok = line_ok and faster
-        log(f"  {LABELS[joint]:<18} {tj:7.2f}s  {'<' if faster else '>='} "
-            f"{LABELS[name]:<16} {to:7.2f}s   {to / tj:5.1f}x  "
-            f"{'OK' if faster else 'FAIL'}")
-    all_faster = all_faster and line_ok
-
-log("\n" + "=" * 78)
-# The full three-way joint (prune + dp + annotated) is the deliverable: the
-# approx solver alone is ~20% off, but the propagated TP plan steers it to the
-# optimum. Annotation is therefore what meets the accuracy bar; prune+dp alone
-# trades accuracy for a little more speed.
-ok_gap = abs(gap_full) <= 10.0
-log(f"ACCEPTANCE gap<=10% (full joint prune+dp+anno): {ok_gap}  "
-    f"(full={gap_full:+.2f}%, <=5%: {abs(gap_full) <= 5.0})")
-log(f"  (informational: prune+dp without annotation = {gap_core:+.2f}%)")
-log(f"ACCEPTANCE joint faster than ILP-based optimizations: {all_faster}")
-log(f"OVERALL: {'PASS' if ok_gap and all_faster else 'CHECK'}")
diff --git a/examples/_bench_sizes.py b/examples/_bench_sizes.py
deleted file mode 100644
index 46962209..00000000
--- a/examples/_bench_sizes.py
+++ /dev/null
@@ -1,166 +0,0 @@
-"""e2e prune+dp (approx) search across LLaMA3 sizes: latency + accuracy.
-
-For one MODEL on one MESH:
-  * latency: lite build (build_pulp=False) + ApproximateShardingSolver -> the
-    production prune+dp path (build_s, approx_s, total, objective).
-  * accuracy: a separate full PuLP build -> HiGHS LP-relaxation lower bound
-    (this sharding LP is integral, so the bound equals the exact ILP optimum);
-    gap = (approx_obj - lb) / lb.
-
-Env: MODEL (1b|3b|8b|70b), MESH (e.g. 2,4,8), SEQLEN. One model per process.
-"""
-import gc
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import numpy as np
-import pulp
-import scipy.sparse as sp
-import torch
-from scipy.optimize import linprog
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "1b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "2,4,8").split(","))
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(
-        rope_theta=500000, vocab_size=vocab_size, max_seq_len=SEQLEN, **_CFG[MODEL]
-    )
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-def constrain(autop):
-    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-    out = (Shard(0), Shard(2)) if ndim == 2 else x
-    autop.add_parameter_memory_constraint(low=None, high=None)
-    autop.add_input_constraints([x])
-    autop.add_output_constraints([out])
-
-
-def lp_lower_bound_highs(opt):
-    """LP relaxation (binaries -> [0,1]) of the built problem, solved with HiGHS.
-    Objective is read from decision_vars and constraints from prob.constraints
-    using id()-keyed indexing (avoids hashing the long PuLP var names)."""
-    opt._set_objective()
-    opt._apply_memory_constraint()
-    variables = opt.prob.variables()
-    vidx = {id(v): i for i, v in enumerate(variables)}
-    n = len(variables)
-    c = np.zeros(n)
-    for key, dv in opt.decision_vars.items():
-        mult = 1 + len(opt._root_to_copies.get(key[0], ()))
-        c[vidx[id(dv.var)]] += dv.cost * mult
-    re = ru = 0
-    reqr, reqc, reqd, beq = [], [], [], []
-    rubr, rubc, rubd, bub = [], [], [], []
-    for con in opt.prob.constraints.values():
-        rhs = -con.constant
-        if con.sense == pulp.LpConstraintEQ:
-            for v, co in con.items():
-                reqr.append(re); reqc.append(vidx[id(v)]); reqd.append(co)
-            beq.append(rhs); re += 1
-        else:
-            s = 1.0 if con.sense == pulp.LpConstraintLE else -1.0
-            for v, co in con.items():
-                rubr.append(ru); rubc.append(vidx[id(v)]); rubd.append(s * co)
-            bub.append(s * rhs); ru += 1
-    A_eq = sp.csr_matrix((reqd, (reqr, reqc)), shape=(re, n)) if re else None
-    A_ub = sp.csr_matrix((rubd, (rubr, rubc)), shape=(ru, n)) if ru else None
-    # Dual simplex: far faster than the barrier (IPM) on this near-integral,
-    # network-flow-like LP. We only need the optimal objective as the bound.
-    method = os.environ.get("LP_METHOD", "highs-ds")
-    res = linprog(c, A_ub=A_ub, b_ub=(bub or None), A_eq=A_eq, b_eq=(beq or None),
-                  bounds=(0, 1), method=method, options={"disp": True})
-    if not res.success:
-        raise RuntimeError(f"HiGHS failed: {res.message}")
-    return res.fun, n, re + ru
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### MODEL={MODEL} mesh={MESH_SHAPE}{names} seqlen={SEQLEN} ###", flush=True)
-
-# ---- latency: lite build + prune+dp approx (production path) ----
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="approx")
-autop.__enter__()
-constrain(autop)
-build_lite = time.perf_counter() - t
-opt = autop.sharding_optimizer
-n_dv = len(opt.decision_vars)
-params = opt.profile["model"]["parameter_numel"]
-t = time.perf_counter()
-ApproximateShardingSolver(opt).get_solution(verbose=False)
-approx_s = time.perf_counter() - t
-obj = opt.profile["approximate"]["objective"]
-print(f"[latency] params={params/1e9:.2f}B  lite_build={build_lite:.1f}s  "
-      f"approx={approx_s:.1f}s  total={build_lite + approx_s:.1f}s  "
-      f"decision_vars={n_dv}  obj={obj:.1f}", flush=True)
-autop.__exit__(None, None, None)
-del autop, opt
-gc.collect()
-
-if os.environ.get("ACCURACY", "1") != "1":
-    print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B  "
-          f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s "
-          f"total={build_lite+approx_s:.1f}s  obj={obj:.1f}  (LP skipped)", flush=True)
-    raise SystemExit(0)
-
-# ---- accuracy: full build + HiGHS LP lower bound ----
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver="ilp")
-autop.__enter__()
-constrain(autop)
-full_build = time.perf_counter() - t
-opt = autop.sharding_optimizer
-t = time.perf_counter()
-lb, nvar, ncon = lp_lower_bound_highs(opt)
-lp_s = time.perf_counter() - t
-gap = 100 * (obj - lb) / lb
-print(f"[accuracy] full_build={full_build:.1f}s  lp_solve={lp_s:.1f}s  "
-      f"lower_bound={lb:.1f}  vars={nvar} cons={ncon}", flush=True)
-print(f"[RESULT] MODEL={MODEL} params={params/1e9:.2f}B  "
-      f"prune+dp: build={build_lite:.1f}s approx={approx_s:.1f}s total={build_lite+approx_s:.1f}s  "
-      f"obj={obj:.1f}  LP_lb={lb:.1f}  gap={gap:+.2f}%", flush=True)
diff --git a/examples/_bench_trws.py b/examples/_bench_trws.py
deleted file mode 100644
index 4e4fbc2d..00000000
--- a/examples/_bench_trws.py
+++ /dev/null
@@ -1,173 +0,0 @@
-"""Prototype TRW-S (tree-reweighted sequential message passing) on the approx
-solver's faithful factor graph, validated against the CBC-exact optimum. If TRW-S
-(optionally + the existing local search) reaches the optimum where plain min-sum
-BP does not, it is the fix. Env: MODEL, MESH, SEQLEN, ITERS."""
-import logging
-import os
-import time
-from unittest.mock import patch
-
-import numpy as np
-import pulp
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-from autoparallel.approximate_sharding import ApproximateShardingSolver
-from autoparallel.cost_models.collective_runtime_estimation import set_nccl_topo_config
-from autoparallel.cost_models.nccl_cost_model import detect_nccl_topo_config
-
-logging.basicConfig(level=logging.ERROR)
-for fn, val in [("device_count", lambda: 8), ("get_device_name", lambda *a, **k: "H100"),
-                ("get_device_capability", lambda *a, **k: (9, 0))]:
-    patch(f"torch.cuda.{fn}", val).start()
-patch("torch.cuda.get_device_properties", lambda *a, **k: type(
-    "P", (), {"major": 9, "minor": 0, "name": "H100",
-              "total_memory": 80 * 1024**3, "multi_processor_count": 132})()).start()
-
-MODEL = os.environ.get("MODEL", "1b")
-SEQLEN = int(os.environ.get("SEQLEN", "2048"))
-MESH_SHAPE = tuple(int(x) for x in os.environ.get("MESH", "8,8").split(","))
-ITERS = int(os.environ.get("ITERS", "1000"))
-USE_CBC = os.environ.get("CBC", "1") == "1"
-ws = 1
-for d in MESH_SHAPE:
-    ws *= d
-names = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}[len(MESH_SHAPE)]
-torch.distributed.init_process_group("fake", store=FakeStore(), rank=0, world_size=ws)
-mesh = torch.distributed.device_mesh.init_device_mesh("cuda", MESH_SHAPE, mesh_dim_names=names)
-ndim = mesh.ndim
-vocab_size = 128256
-batch_size = 2 * mesh.shape[0]
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "3b": dict(dim=3072, n_layers=28, n_heads=24, n_kv_heads=8, ffn_dim_multiplier=1.0, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-    "70b": dict(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=4096),
-}
-
-
-def model_fn():
-    args = TransformerModelArgs(rope_theta=500000, vocab_size=vocab_size,
-                                max_seq_len=SEQLEN, **_CFG[MODEL])
-    with torch.device("meta"):
-        return Transformer(args)
-
-
-def input_fn():
-    return torch.randint(0, vocab_size, (batch_size, SEQLEN), device="cuda")
-
-
-def constrain(autop):
-    x = (Shard(0),) + (Replicate(),) * (ndim - 1)
-    out = (Shard(0), Shard(2)) if ndim == 2 else x
-    autop.add_parameter_memory_constraint(low=None, high=None)
-    autop.add_input_constraints([x])
-    autop.add_output_constraints([out])
-
-
-set_nccl_topo_config(detect_nccl_topo_config(mesh))
-mp = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-print(f"### TRW-S MODEL={MODEL} mesh={MESH_SHAPE}{names} iters={ITERS} ###", flush=True)
-
-backend = "ilp" if USE_CBC else "approx"
-t = time.perf_counter()
-autop = AutoParallel(model_fn(), input_fn, mesh, mp, repeated_subgraphs=True, solver=backend)
-autop.__enter__()
-constrain(autop)
-opt = autop.sharding_optimizer
-print(f"[build] {time.perf_counter()-t:.1f}s decision_vars={len(opt.decision_vars)}", flush=True)
-
-obj_cbc = None
-if USE_CBC:
-    opt._set_objective()
-    opt._apply_memory_constraint()
-    t = time.perf_counter()
-    opt.prob.solve(pulp.PULP_CBC_CMD(msg=False, options=["preprocess off"]))
-    obj_cbc = pulp.value(opt.prob.objective)
-    print(f"[cbc] obj={obj_cbc:.1f} status={pulp.LpStatus[opt.prob.status]} "
-          f"({time.perf_counter()-t:.1f}s)", flush=True)
-
-
-_REF = obj_cbc if obj_cbc else float(os.environ.get("LP_LB", "0")) or None
-
-
-def gap(o):
-    return 100 * (o - _REF) / _REF if _REF else float("nan")
-
-
-# Stock approx (BP + local search) for comparison.
-a0 = ApproximateShardingSolver(opt)
-t = time.perf_counter()
-a0.get_solution(verbose=False)
-print(f"[stock approx] obj={opt.profile['approximate']['objective']:.1f} "
-      f"gap={gap(opt.profile['approximate']['objective']):+.2f}% ({time.perf_counter()-t:.1f}s)", flush=True)
-
-# Build a fresh factor graph for TRW-S.
-A = ApproximateShardingSolver(opt)
-A._build_problem()
-A._build_factors()
-G = len(A.groups)
-nbrs = A.nbrs
-unary = A.g_unary
-order = sorted(range(G), key=lambda g: min(A.groups[g].members))
-pos = [0] * G
-for i, g in enumerate(order):
-    pos[g] = i
-gamma = []
-for g in range(G):
-    indeg = sum(1 for h in nbrs[g] if pos[h] < pos[g])
-    outdeg = sum(1 for h in nbrs[g] if pos[h] > pos[g])
-    gamma.append(1.0 / max(1, max(indeg, outdeg)))
-
-msg = {}
-for g in range(G):
-    for h in nbrs[g]:
-        msg[(g, h)] = np.zeros(len(unary[h]))
-
-t = time.perf_counter()
-best = float("inf")
-best_snap = None
-for it in range(ITERS):
-    for forward in (True, False):
-        seq = order if forward else order[::-1]
-        for p in seq:
-            if not nbrs[p]:
-                continue
-            agg = unary[p].copy()
-            for r in nbrs[p]:
-                agg += msg[(r, p)]
-            wp = gamma[p] * agg
-            for q in nbrs[p]:
-                if (pos[q] > pos[p]) != forward:
-                    continue
-                P = A._pair_matrix(p, q)  # (D_p, D_q)
-                mm = (wp - msg[(q, p)])[:, None] + P
-                mq = mm.min(axis=0)
-                mq -= mq.min()
-                msg[(p, q)] = mq
-    A._decode(msg)
-    e = A._fast_total_energy()
-    if e < best - 1e-6:
-        best = e
-        best_snap = [g.current for g in A.groups]
-    if it < 5 or it % 50 == 0:
-        print(f"  [trws it={it}] decode_energy={e:.1f} best={best:.1f} gap={gap(best):+.2f}%", flush=True)
-trws_s = time.perf_counter() - t
-for gid, ci in enumerate(best_snap):
-    A._set_group(gid, ci)
-print(f"[TRW-S] best={best:.1f} gap={gap(best):+.2f}% ({trws_s:.1f}s, {ITERS} iters)", flush=True)
-
-# Polish TRW-S result with the existing local search.
-deadline = time.perf_counter() + 60
-A._memory_repair()
-A._coordinate_descent(deadline)
-A._star_block_search(deadline)
-polished = A._fast_total_energy()
-print(f"[TRW-S + local search] obj={polished:.1f} gap={gap(polished):+.2f}%", flush=True)
-print(f"[RESULT] MODEL={MODEL} mesh={MESH_SHAPE} cbc={obj_cbc} "
-      f"stock_gap={gap(opt.profile['approximate']['objective']):+.2f}% "
-      f"trws_gap={gap(best):+.2f}% trws_ls_gap={gap(polished):+.2f}%", flush=True)
diff --git a/examples/_sanity_llama3.py b/examples/_sanity_llama3.py
deleted file mode 100644
index 6a44b386..00000000
--- a/examples/_sanity_llama3.py
+++ /dev/null
@@ -1,223 +0,0 @@
-"""Real LLaMA3 AutoParallel training sanity check on a 2D or 3D mesh.
-
-Traces the model, picks a sharding strategy with the approximate (TRW-S) solver,
-applies it as DTensor, and trains a fixed random batch for a few steps on real
-GPUs. Pass: the loss curve goes down. Adapted from example_sanity_check_qwen3.py.
-
-The batch is data-parallel on the `dp` axis only; any other axes (`cp`, `tp`)
-are model-sharding axes (the solver shards params/activations over them). Logits
-are vocab-parallel on `tp` and replicated on `cp`, so the loss is reduced over
-the world and normalized by global_token_count * (world_size // dp_degree).
-
-Run: torchrun --standalone --nproc-per-node N examples/_sanity_llama3.py --mesh 2,2,8 --model 8b
-"""
-import argparse
-import logging
-import os
-import time
-
-import torch
-import torch.distributed as dist
-import torch.distributed.nn.functional as dist_nn_func
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-
-from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
-from autoparallel.api import AutoParallel
-
-_CFG = {
-    "1b": dict(dim=2048, n_layers=16, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.5, multiple_of=256),
-    "8b": dict(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, ffn_dim_multiplier=1.3, multiple_of=1024),
-}
-_NAMES = {2: ("dp", "tp"), 3: ("dp", "cp", "tp")}
-
-
-def parse_args():
-    p = argparse.ArgumentParser(description="LLaMA3 AutoParallel training sanity check.")
-    p.add_argument("--model", type=str, default="1b", choices=list(_CFG))
-    p.add_argument("--mesh", type=str, default="2,2", help="comma-separated mesh dims")
-    p.add_argument("--global-batch-size", type=int, default=8)
-    p.add_argument("--microbatch-size", type=int, default=2)
-    p.add_argument("--seq-len", type=int, default=512)
-    p.add_argument("--train-steps", type=int, default=10)
-    p.add_argument("--lr", type=float, default=1e-3)
-    p.add_argument("--max-grad-norm", type=float, default=1.0)
-    p.add_argument("--seed", type=int, default=0)
-    p.add_argument("--solver", type=str, default="approx")
-    p.add_argument("--verbose", action="store_true")
-    return p.parse_args()
-
-
-def init_distributed(args):
-    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
-        raise RuntimeError("Run with torchrun --standalone --nproc-per-node N ...")
-    world_size = int(os.environ["WORLD_SIZE"])
-    local_rank = int(os.environ["LOCAL_RANK"])
-    dims = tuple(int(x) for x in args.mesh.split(","))
-    prod = 1
-    for d in dims:
-        prod *= d
-    if prod != world_size:
-        raise ValueError(f"WORLD_SIZE {world_size} != prod(mesh) {prod}")
-    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
-    dist.init_process_group("nccl", device_id=device)
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda", dims, mesh_dim_names=_NAMES[len(dims)]
-    )
-    return device, mesh
-
-
-def placement_for(name, *, is_output):
-    if name == "dp":
-        return Shard(0)
-    if name == "tp" and is_output:
-        return Shard(2)
-    return Replicate()
-
-
-def make_local_tokens(args, mesh, device, vocab_size):
-    names = mesh.mesh_dim_names
-    dp_rank = mesh.get_coordinate()[names.index("dp")]
-    dp_degree = mesh["dp"].size()
-    local_batch_size = args.global_batch_size // dp_degree
-    gen = torch.Generator(device="cpu")
-    gen.manual_seed(args.seed)
-    tokens = torch.randint(
-        0, vocab_size, (args.global_batch_size, args.seq_len + 1),
-        generator=gen, dtype=torch.long,
-    )
-    start = dp_rank * local_batch_size
-    return tokens[start:start + local_batch_size].to(device, non_blocking=True)
-
-
-def vocab_parallel_cross_entropy(logits, labels, *, vocab_size, tp_group, tp_rank,
-                                 tp_degree, normalizer):
-    local_vocab_size = logits.shape[-1]
-    vocab_start = tp_rank * local_vocab_size
-    vocab_stop = vocab_size if tp_rank == tp_degree - 1 else vocab_start + local_vocab_size
-    logits = logits.float()
-    local_max = logits.amax(dim=-1)
-    with torch.no_grad():
-        global_max = local_max.detach().clone()
-        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
-    shifted = logits - global_max.unsqueeze(-1)
-    global_exp_sum = dist_nn_func.all_reduce(
-        shifted.exp().sum(dim=-1), op=dist.ReduceOp.SUM, group=tp_group)
-    mask = (labels >= vocab_start) & (labels < vocab_stop)
-    local_target = torch.zeros_like(labels, dtype=torch.long)
-    local_target[mask] = labels[mask] - vocab_start
-    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
-    local_target_logits = local_target_logits * mask.to(logits.dtype)
-    target_logits = dist_nn_func.all_reduce(
-        local_target_logits, op=dist.ReduceOp.SUM, group=tp_group)
-    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
-    return loss_sum / normalizer
-
-
-def print_rank0(msg):
-    if dist.get_rank() == 0:
-        print(msg, flush=True)
-
-
-def main():
-    args = parse_args()
-    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING)
-    device, mesh = init_distributed(args)
-    names = mesh.mesh_dim_names
-    world_size = dist.get_world_size()
-    tp_group = mesh.get_group("tp")
-    tp_rank = mesh.get_local_rank("tp")
-    tp_degree = mesh["tp"].size()
-    dp_degree = mesh["dp"].size()
-    local_batch_size = args.global_batch_size // dp_degree
-    grad_accum = local_batch_size // args.microbatch_size
-    # logits are distinct only across dp (cp/tp replicate the per-token loss),
-    # so the world all-reduce over-counts by world_size // dp_degree.
-    normalizer = args.global_batch_size * args.seq_len * (world_size // dp_degree)
-
-    torch.manual_seed(args.seed)
-    model_args = TransformerModelArgs(
-        rope_theta=500000, vocab_size=128256, max_seq_len=args.seq_len, **_CFG[args.model],
-    )
-    trace_global_batch = args.microbatch_size * dp_degree
-
-    with torch.device("meta"):
-        model = Transformer(model_args)
-
-    def input_fn():
-        return torch.randint(0, model_args.vocab_size,
-                             (trace_global_batch, args.seq_len), device=device)
-
-    mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
-    x_sharding = tuple(placement_for(n, is_output=False) for n in names)
-    out_sharding = tuple(placement_for(n, is_output=True) for n in names)
-    print_rank0(f"LLaMA3-{args.model} sanity: mesh={tuple(mesh.shape)}{names} "
-                f"solver={args.solver} in={x_sharding} out={out_sharding} "
-                f"global_batch={args.global_batch_size} microbatch={args.microbatch_size} "
-                f"grad_accum={grad_accum} seq_len={args.seq_len} steps={args.train_steps} lr={args.lr}")
-
-    t0 = time.time()
-    with AutoParallel(model, input_fn, mesh, mp_policy, repeated_subgraphs=True,
-                      solver=args.solver) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-        autop.add_input_constraints([x_sharding])
-        autop.add_output_constraints([out_sharding])
-        sharding_placement = autop.optimize_placement(verbose=args.verbose)
-        parallel_mod = autop.apply_placement(sharding_placement)
-    print_rank0(f"trace+optimize+apply took {time.time() - t0:.1f}s")
-
-    parallel_mod.to_empty(device=device)
-    parallel_mod.init_weights(buffer_device=device)
-
-    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
-    inputs = batch[:, :-1].contiguous()
-    labels = batch[:, 1:].contiguous()
-    input_mbs = inputs.split(args.microbatch_size, dim=0)
-    label_mbs = labels.split(args.microbatch_size, dim=0)
-    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
-
-    try:
-        losses = []
-        step_times = []
-        for step in range(args.train_steps):
-            torch.cuda.synchronize(device)
-            t_step = time.perf_counter()
-            optimizer.zero_grad(set_to_none=True)
-            step_loss = torch.zeros((), device=device)
-            for mi, ml in zip(input_mbs, label_mbs):
-                logits = parallel_mod(mi)
-                loss = vocab_parallel_cross_entropy(
-                    logits, ml, vocab_size=model_args.vocab_size, tp_group=tp_group,
-                    tp_rank=tp_rank, tp_degree=tp_degree, normalizer=normalizer)
-                loss.backward()
-                step_loss = step_loss + loss.detach()
-            torch.nn.utils.clip_grad_norm_(parallel_mod.parameters(), args.max_grad_norm)
-            optimizer.step()
-            torch.cuda.synchronize(device)
-            step_times.append(time.perf_counter() - t_step)
-            with torch.no_grad():
-                logged = step_loss.clone()
-                dist.all_reduce(logged, op=dist.ReduceOp.SUM)
-            losses.append(float(logged.item()))
-            print_rank0(f"step={step:03d} loss={losses[-1]:.6f} step_time={1000*step_times[-1]:.0f}ms")
-
-        warmup = min(3, max(0, len(step_times) - 2))
-        steady = sorted(step_times[warmup:])
-        if steady:
-            mean_ms = 1000 * sum(steady) / len(steady)
-            print_rank0(f"[latency] solver={args.solver} per-step (excl {warmup} warmup, "
-                        f"{len(steady)} steps): mean={mean_ms:.0f}ms "
-                        f"median={1000*steady[len(steady)//2]:.0f}ms min={1000*steady[0]:.0f}ms")
-        print_rank0(f"\nloss curve: {[round(x, 4) for x in losses]}")
-        verdict = "PASS" if losses[-1] < losses[0] else "FAIL"
-        print_rank0(f"SANITY {verdict}: loss {losses[0]:.4f} -> {losses[-1]:.4f}")
-        dist.barrier(device_ids=[device.index])
-        torch.cuda.synchronize(device)
-    finally:
-        if dist.is_initialized():
-            dist.destroy_process_group()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/example_llama3.py b/examples/example_llama3.py
index 0879e568..00025f5c 100644
--- a/examples/example_llama3.py
+++ b/examples/example_llama3.py
@@ -28,9 +28,6 @@
 )
 from autoparallel.graph_passes.debug_helpers import make_custom_runtime_estimation
 from autoparallel.graph_passes.estimate_graph_metrics import estimate_graph_metrics
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Partial, Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
 
 logging.basicConfig(level=logging.DEBUG)
 
diff --git a/examples/example_qwen3.py b/examples/example_qwen3.py
deleted file mode 100644
index 2ae57b00..00000000
--- a/examples/example_qwen3.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import argparse
-import logging
-import time
-
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-from torch.testing._internal.distributed.fake_pg import FakeStore
-
-from autoparallel._testing.models.qwen3 import (
-    Qwen3ModelArgs,
-    Transformer,
-    qwen3_235b_a22b_args,
-    qwen3_30b_a3b_args,
-    qwen3_8b_args,
-    qwen3_debug_args,
-    qwen3_moe_debug_args,
-)
-from autoparallel.api import AutoParallel
-from autoparallel.compile import autoparallel_backend
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Trace, optimize, and smoke-test dense Qwen3 with AutoParallel."
-    )
-    parser.add_argument(
-        "--flavor",
-        choices=("tiny", "moe-tiny", "debug", "8b", "moe-debug", "30b-a3b", "235b-a22b"),
-        default="tiny",
-        help="Qwen3 model size to instantiate. Defaults to tiny for faster runs.",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=None,
-        help="Sequence length. Defaults to 8 for tiny, 512 for debug, and 4096 for 8b.",
-    )
-    parser.add_argument(
-        "--world-size",
-        type=int,
-        default=64,
-        help="Fake process-group world size.",
-    )
-    parser.add_argument(
-        "--tp-degree",
-        type=int,
-        default=8,
-        help="Second mesh degree. Used as TP for dense flavors and EP for MoE flavors.",
-    )
-    parser.add_argument(
-        "--local-batch-size",
-        type=int,
-        default=2,
-        help="Per-DP-rank batch size used for the runtime smoke pass.",
-    )
-    parser.add_argument(
-        "--save-optimizer",
-        type=str,
-        default=None,
-        help="Optional path for the serialized sharding optimizer state.",
-    )
-    parser.add_argument(
-        "--compile",
-        action="store_true",
-        help="Compile the placed module with the AutoParallel backend before running.",
-    )
-    parser.add_argument(
-        "--skip-run",
-        action="store_true",
-        help="Only run tracing, optimization, and placement application.",
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="Print the full AutoParallel optimizer log.",
-    )
-    return parser.parse_args()
-
-
-def make_model_args(flavor: str, seq_len: int):
-    if flavor == "tiny":
-        return Qwen3ModelArgs(
-            dim=64,
-            n_layers=2,
-            n_heads=4,
-            n_kv_heads=2,
-            head_dim=16,
-            hidden_dim=128,
-            vocab_size=128,
-            max_seq_len=seq_len,
-        )
-    if flavor == "moe-tiny":
-        return Qwen3ModelArgs(
-            dim=64,
-            n_layers=1,
-            n_heads=4,
-            n_kv_heads=2,
-            head_dim=16,
-            hidden_dim=128,
-            vocab_size=128,
-            max_seq_len=seq_len,
-            moe_enabled=True,
-            moe_hidden_dim=32,
-            num_experts=8,
-            top_k=2,
-            route_norm=True,
-            score_before_experts=False,
-        )
-    if flavor == "debug":
-        return qwen3_debug_args(max_seq_len=seq_len)
-    if flavor == "8b":
-        return qwen3_8b_args(max_seq_len=seq_len)
-    if flavor == "moe-debug":
-        return qwen3_moe_debug_args(max_seq_len=seq_len)
-    if flavor == "30b-a3b":
-        return qwen3_30b_a3b_args(max_seq_len=seq_len)
-    if flavor == "235b-a22b":
-        return qwen3_235b_a22b_args(max_seq_len=seq_len)
-    raise ValueError(f"Unknown Qwen3 flavor: {flavor}")
-
-
-def main():
-    args = parse_args()
-    logging.basicConfig(level=logging.DEBUG)
-
-    seq_len = args.seq_len
-    if seq_len is None:
-        seq_len = {
-            "tiny": 8,
-            "moe-tiny": 8,
-            "debug": 512,
-            "8b": 4096,
-            "moe-debug": 512,
-            "30b-a3b": 4096,
-            "235b-a22b": 4096,
-        }[args.flavor]
-    if args.world_size % args.tp_degree != 0:
-        raise ValueError(
-            f"world-size ({args.world_size}) must be divisible by "
-            f"tp-degree ({args.tp_degree})."
-        )
-
-    if not torch.distributed.is_initialized():
-        fake_store = FakeStore()
-        torch.distributed.init_process_group(
-            "fake",
-            store=fake_store,
-            rank=0,
-            world_size=args.world_size,
-        )
-
-    model_args = make_model_args(args.flavor, seq_len)
-    mesh_dim_names = ("dp", "ep") if model_args.moe_enabled else ("dp", "tp")
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda",
-        (args.world_size // args.tp_degree, args.tp_degree),
-        mesh_dim_names=mesh_dim_names,
-    )
-    device = torch.device("cuda")
-
-    global_batch_size = args.local_batch_size * mesh.shape[0]
-    if model_args.moe_enabled:
-        global_batch_size *= mesh.shape[1]
-
-    with torch.device("meta"):
-        model = Transformer(
-            model_args,
-            mesh=mesh if model_args.moe_enabled else None,
-            moe_axis_name=mesh.mesh_dim_names[1],
-        )
-
-    def input_fn():
-        return torch.randint(
-            0,
-            model_args.vocab_size,
-            (global_batch_size, seq_len),
-            device=device,
-        )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
-    )
-
-    t0 = time.time()
-    with AutoParallel(
-        model,
-        input_fn,
-        mesh,
-        mp_policy,
-        dynamic=model_args.moe_enabled,
-        repeated_subgraphs=True,
-    ) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-
-        x_sharding = (Shard(0), Shard(0)) if model_args.moe_enabled else (Shard(0), Replicate())
-        out_sharding = (Shard(0), Shard(2))
-        autop.add_input_constraints([x_sharding])
-        autop.add_output_constraints([out_sharding])
-
-        sharding_placement = autop.optimize_placement(verbose=args.verbose)
-        print(f"Tracing + optimization took {time.time() - t0:.1f}s")
-
-        if args.save_optimizer is not None:
-            autop.sharding_optimizer.save(args.save_optimizer)
-            autop.sharding_optimizer.save_placements(
-                f"{args.save_optimizer}.placements.json"
-            )
-
-        parallel_mod = autop.apply_placement(sharding_placement)
-
-    if args.skip_run:
-        print("Placement applied successfully.")
-        return
-
-    parallel_mod.to_empty(device=device)
-    parallel_mod.init_weights(buffer_device=device)  # type: ignore[operator]
-
-    if args.compile:
-        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
-
-    tokens = torch.randint(
-        0,
-        model_args.vocab_size,
-        (args.local_batch_size, seq_len),
-        device=device,
-    )
-    out = parallel_mod(tokens)
-    if torch.any(torch.isnan(out)):
-        raise RuntimeError("Found NaNs in Qwen3 forward output.")
-    out.backward(torch.randn_like(out))
-    print("All good!")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/example_sanity_check_qwen3.py b/examples/example_sanity_check_qwen3.py
deleted file mode 100644
index b7af6c0d..00000000
--- a/examples/example_sanity_check_qwen3.py
+++ /dev/null
@@ -1,335 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import argparse
-import logging
-import os
-import time
-
-import torch
-import torch.distributed as dist
-import torch.distributed.nn.functional as dist_nn_func
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-
-from autoparallel._testing.models.qwen3 import Transformer, qwen3_8b_args
-from autoparallel.api import AutoParallel
-from autoparallel.compile import autoparallel_backend
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Run a real Qwen3 8B AutoParallel training sanity check."
-    )
-    parser.add_argument(
-        "--global-batch-size",
-        type=int,
-        default=16,
-        help="Global batch size across data-parallel ranks.",
-    )
-    parser.add_argument(
-        "--microbatch-size",
-        type=int,
-        default=1,
-        help="Per-DP-rank microbatch size for gradient accumulation.",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=4096,
-        help="Sequence length. Defaults to Qwen3 8B's max sequence length.",
-    )
-    parser.add_argument(
-        "--dp-degree",
-        type=int,
-        default=2,
-        help="Data-parallel mesh degree.",
-    )
-    parser.add_argument(
-        "--tp-degree",
-        type=int,
-        default=2,
-        help="Tensor-parallel mesh degree.",
-    )
-    parser.add_argument(
-        "--train-steps",
-        type=int,
-        default=20,
-        help="Number of optimizer steps.",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=3e-4,
-        help="AdamW learning rate.",
-    )
-    parser.add_argument(
-        "--max-grad-norm",
-        type=float,
-        default=1.0,
-        help="Gradient clipping max norm.",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=0,
-        help="Seed for model initialization and synthetic data generation.",
-    )
-    parser.add_argument(
-        "--compile",
-        action="store_true",
-        help="Compile the placed module with the AutoParallel backend before training.",
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="Print the full AutoParallel optimizer log.",
-    )
-    return parser.parse_args()
-
-
-def init_distributed(args):
-    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
-        raise RuntimeError(
-            "Run this example with torchrun, e.g. "
-            "torchrun --standalone --nproc-per-node 4 "
-            "examples/example_sanity_check_qwen3.py"
-        )
-
-    world_size = int(os.environ["WORLD_SIZE"])
-    local_rank = int(os.environ["LOCAL_RANK"])
-    expected_world_size = args.dp_degree * args.tp_degree
-    if world_size != expected_world_size:
-        raise ValueError(
-            f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree "
-            f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})."
-        )
-    if args.global_batch_size % args.dp_degree != 0:
-        raise ValueError(
-            f"global-batch-size ({args.global_batch_size}) must be divisible by "
-            f"dp-degree ({args.dp_degree})."
-        )
-    local_batch_size = args.global_batch_size // args.dp_degree
-    if local_batch_size % args.microbatch_size != 0:
-        raise ValueError(
-            f"local batch size ({local_batch_size}) must be divisible by "
-            f"microbatch-size ({args.microbatch_size})."
-        )
-
-    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
-    dist.init_process_group("nccl", device_id=device)
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda",
-        (args.dp_degree, args.tp_degree),
-        mesh_dim_names=("dp", "tp"),
-    )
-    return device, mesh
-
-
-def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
-    coordinate = mesh.get_coordinate()
-    if coordinate is None:
-        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
-    dp_rank, _tp_rank = coordinate
-    local_batch_size = args.global_batch_size // args.dp_degree
-
-    generator = torch.Generator(device="cpu")
-    generator.manual_seed(args.seed)
-    tokens = torch.randint(
-        0,
-        vocab_size,
-        (args.global_batch_size, args.seq_len + 1),
-        generator=generator,
-        dtype=torch.long,
-    )
-
-    start = dp_rank * local_batch_size
-    stop = start + local_batch_size
-    return tokens[start:stop].to(device, non_blocking=True)
-
-
-def vocab_parallel_cross_entropy(
-    logits: torch.Tensor,
-    labels: torch.Tensor,
-    *,
-    vocab_size: int,
-    tp_group,
-    tp_rank: int,
-    tp_degree: int,
-    global_token_count: int,
-) -> torch.Tensor:
-    if logits.shape[:2] != labels.shape:
-        raise ValueError(
-            f"logits shape {tuple(logits.shape)} is incompatible with "
-            f"labels shape {tuple(labels.shape)}."
-        )
-
-    local_vocab_size = logits.shape[-1]
-    vocab_start = tp_rank * local_vocab_size
-    vocab_stop = vocab_start + local_vocab_size
-    if tp_rank == tp_degree - 1:
-        vocab_stop = vocab_size
-
-    logits = logits.float()
-    local_max = logits.amax(dim=-1)
-    with torch.no_grad():
-        global_max = local_max.detach().clone()
-        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
-
-    shifted_logits = logits - global_max.unsqueeze(-1)
-    local_exp_sum = shifted_logits.exp().sum(dim=-1)
-    global_exp_sum = dist_nn_func.all_reduce(
-        local_exp_sum,
-        op=dist.ReduceOp.SUM,
-        group=tp_group,
-    )
-
-    target_mask = (labels >= vocab_start) & (labels < vocab_stop)
-    local_target = torch.zeros_like(labels, dtype=torch.long)
-    local_target[target_mask] = labels[target_mask] - vocab_start
-    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
-    local_target_logits = local_target_logits * target_mask.to(logits.dtype)
-    target_logits = dist_nn_func.all_reduce(
-        local_target_logits,
-        op=dist.ReduceOp.SUM,
-        group=tp_group,
-    )
-
-    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
-    return loss_sum / (global_token_count * tp_degree)
-
-
-def print_rank0(message: str) -> None:
-    if dist.get_rank() == 0:
-        print(message, flush=True)
-
-
-def main():
-    args = parse_args()
-    logging.basicConfig(level=logging.DEBUG)
-
-    device, mesh = init_distributed(args)
-    tp_group = mesh.get_group("tp")
-    tp_rank = mesh.get_local_rank("tp")
-    local_batch_size = args.global_batch_size // args.dp_degree
-    gradient_accumulation_steps = local_batch_size // args.microbatch_size
-
-    torch.manual_seed(args.seed)
-    model_args = qwen3_8b_args(max_seq_len=args.seq_len)
-    trace_global_batch_size = args.microbatch_size * args.dp_degree
-
-    with torch.device("meta"):
-        model = Transformer(model_args)
-
-    def input_fn():
-        return torch.randint(
-            0,
-            model_args.vocab_size,
-            (trace_global_batch_size, args.seq_len),
-            device=device,
-        )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
-    )
-
-    print_rank0(
-        "Qwen3 8B sanity check: "
-        f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), "
-        f"global_batch={args.global_batch_size}, "
-        f"local_batch={local_batch_size}, "
-        f"microbatch={args.microbatch_size}, "
-        f"grad_accum={gradient_accumulation_steps}, "
-        f"trace_global_batch={trace_global_batch_size}, "
-        f"seq_len={args.seq_len}"
-    )
-
-    t0 = time.time()
-    with AutoParallel(
-        model,
-        input_fn,
-        mesh,
-        mp_policy,
-        repeated_subgraphs=True,
-    ) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-        autop.add_input_constraints([(Shard(0), Replicate())])
-        autop.add_output_constraints([(Shard(0), Shard(2))])
-        sharding_placement = autop.optimize_placement(verbose=args.verbose)
-        parallel_mod = autop.apply_placement(sharding_placement)
-
-    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
-
-    parallel_mod.to_empty(device=device)
-    parallel_mod.init_weights(buffer_device=device, seed=args.seed)  # type: ignore[operator]
-
-    if args.compile:
-        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
-
-    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
-    inputs = batch[:, :-1].contiguous()
-    labels = batch[:, 1:].contiguous()
-    input_microbatches = inputs.split(args.microbatch_size, dim=0)
-    label_microbatches = labels.split(args.microbatch_size, dim=0)
-    global_token_count = args.global_batch_size * args.seq_len
-    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
-
-    try:
-        losses: list[float] = []
-        for step in range(args.train_steps):
-            optimizer.zero_grad(set_to_none=True)
-            step_loss = torch.zeros((), device=device)
-            for micro_inputs, micro_labels in zip(
-                input_microbatches, label_microbatches
-            ):
-                logits = parallel_mod(micro_inputs)
-                if torch.any(torch.isnan(logits)):
-                    raise RuntimeError("Found NaNs in Qwen3 forward output.")
-
-                loss = vocab_parallel_cross_entropy(
-                    logits,
-                    micro_labels,
-                    vocab_size=model_args.vocab_size,
-                    tp_group=tp_group,
-                    tp_rank=tp_rank,
-                    tp_degree=args.tp_degree,
-                    global_token_count=global_token_count,
-                )
-                if torch.any(torch.isnan(loss)):
-                    raise RuntimeError("Found NaNs in Qwen3 training loss.")
-
-                loss.backward()
-                step_loss = step_loss + loss.detach()
-
-            torch.nn.utils.clip_grad_norm_(
-                parallel_mod.parameters(), args.max_grad_norm
-            )
-            optimizer.step()
-
-            with torch.no_grad():
-                logged_loss = step_loss.clone()
-                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
-                loss_value = float(logged_loss.item())
-            losses.append(loss_value)
-            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
-
-        if losses[-1] >= losses[0]:
-            raise RuntimeError(
-                f"Qwen3 training loss did not improve: initial={losses[0]:.6f}, "
-                f"final={losses[-1]:.6f}"
-            )
-
-        print_rank0(f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}")
-        dist.barrier(device_ids=[device.index])
-        torch.cuda.synchronize(device)
-    finally:
-        if dist.is_initialized():
-            dist.destroy_process_group()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/example_sanity_check_qwen3_moe.py b/examples/example_sanity_check_qwen3_moe.py
deleted file mode 100644
index dd16afb7..00000000
--- a/examples/example_sanity_check_qwen3_moe.py
+++ /dev/null
@@ -1,466 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import argparse
-import logging
-import os
-import time
-
-import torch
-import torch.distributed as dist
-import torch.distributed.nn.functional as dist_nn_func
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Shard
-
-from autoparallel._testing.models.qwen3 import (
-    Qwen3ModelArgs,
-    Transformer,
-    qwen3_235b_a22b_args,
-    qwen3_30b_a3b_args,
-    qwen3_moe_debug_args,
-)
-from autoparallel.api import AutoParallel
-from autoparallel.compile import autoparallel_backend
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Run a real Qwen3 MoE AutoParallel training sanity check."
-    )
-    parser.add_argument(
-        "--flavor",
-        choices=("moe-tiny", "moe-debug", "30b-a3b", "235b-a22b"),
-        default="30b-a3b",
-        help="Qwen3 MoE model size. Defaults to the real Qwen3-30B-A3B model.",
-    )
-    parser.add_argument(
-        "--global-batch-size",
-        type=int,
-        default=4,
-        help="Global batch size across data-parallel ranks.",
-    )
-    parser.add_argument(
-        "--microbatch-size",
-        type=int,
-        default=1,
-        help="Per-rank input microbatch size before EP all-gather inside the model.",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=8192,
-        help="Sequence length. Defaults to 8192 for the 4xH100 sanity run.",
-    )
-    parser.add_argument(
-        "--dp-degree",
-        type=int,
-        default=2,
-        help="Data-parallel mesh degree.",
-    )
-    parser.add_argument(
-        "--ep-degree",
-        type=int,
-        default=2,
-        help="Expert-parallel mesh degree.",
-    )
-    parser.add_argument(
-        "--train-steps",
-        type=int,
-        default=30,
-        help="Number of optimizer steps.",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=3e-4,
-        help="Optimizer learning rate.",
-    )
-    parser.add_argument(
-        "--optimizer",
-        choices=("adamw", "sgd", "none"),
-        default="adamw",
-        help="Optimizer to use after backward. Use sgd/none for large-model memory smoke runs.",
-    )
-    parser.add_argument(
-        "--max-grad-norm",
-        type=float,
-        default=1.0,
-        help="Gradient clipping max norm.",
-    )
-    parser.add_argument(
-        "--loss-chunk-size",
-        type=int,
-        default=512,
-        help=(
-            "Sequence chunk size for vocab-parallel cross entropy. "
-            "Keeps the 8192-token real-model run from materializing full-size "
-            "float logits and exp buffers at once."
-        ),
-    )
-    parser.add_argument(
-        "--skip-loss-improvement-check",
-        action="store_true",
-        help="Only require finite forward/backward/optimizer steps.",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=0,
-        help="Seed for model initialization and synthetic data generation.",
-    )
-    parser.add_argument(
-        "--compile",
-        action="store_true",
-        help="Compile the placed module with the AutoParallel backend before training.",
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="Print the full AutoParallel optimizer log.",
-    )
-    return parser.parse_args()
-
-
-def make_model_args(flavor: str, seq_len: int | None) -> Qwen3ModelArgs:
-    if flavor == "moe-tiny":
-        max_seq_len = 512 if seq_len is None else seq_len
-        return Qwen3ModelArgs(
-            dim=64,
-            n_layers=1,
-            n_heads=4,
-            n_kv_heads=2,
-            head_dim=16,
-            hidden_dim=128,
-            vocab_size=128,
-            max_seq_len=max_seq_len,
-            moe_enabled=True,
-            moe_hidden_dim=32,
-            num_experts=8,
-            top_k=2,
-            route_norm=True,
-            score_before_experts=False,
-            moe_axis_name="ep",
-        )
-    overrides = {"moe_axis_name": "ep"}
-    if seq_len is not None:
-        overrides["max_seq_len"] = seq_len
-    if flavor == "moe-debug":
-        return qwen3_moe_debug_args(**overrides)
-    if flavor == "30b-a3b":
-        return qwen3_30b_a3b_args(**overrides)
-    if flavor == "235b-a22b":
-        return qwen3_235b_a22b_args(**overrides)
-    raise ValueError(f"Unknown Qwen3 MoE flavor: {flavor}")
-
-
-def init_distributed(args):
-    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
-        raise RuntimeError(
-            "Run this example with torchrun, e.g. "
-            "torchrun --standalone --nproc-per-node 4 "
-            "examples/example_sanity_check_qwen3_moe.py"
-        )
-
-    world_size = int(os.environ["WORLD_SIZE"])
-    local_rank = int(os.environ["LOCAL_RANK"])
-    expected_world_size = args.dp_degree * args.ep_degree
-    if world_size != expected_world_size:
-        raise ValueError(
-            f"WORLD_SIZE ({world_size}) must equal dp-degree * ep-degree "
-            f"({args.dp_degree} * {args.ep_degree} = {expected_world_size})."
-        )
-    if args.global_batch_size % args.dp_degree != 0:
-        raise ValueError(
-            f"global-batch-size ({args.global_batch_size}) must be divisible by "
-            f"dp-degree ({args.dp_degree})."
-        )
-
-    local_dp_batch_size = args.global_batch_size // args.dp_degree
-    local_dp_microbatch = args.microbatch_size * args.ep_degree
-    if local_dp_batch_size % local_dp_microbatch != 0:
-        raise ValueError(
-            f"local DP batch size ({local_dp_batch_size}) must be divisible by "
-            f"microbatch-size * ep-degree "
-            f"({args.microbatch_size} * {args.ep_degree} = {local_dp_microbatch})."
-        )
-
-    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
-    dist.init_process_group("nccl", device_id=device)
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda",
-        (args.dp_degree, args.ep_degree),
-        mesh_dim_names=("dp", "ep"),
-    )
-    return device, mesh
-
-
-def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
-    coordinate = mesh.get_coordinate()
-    if coordinate is None:
-        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
-    dp_rank, _ep_rank = coordinate
-    local_dp_batch_size = args.global_batch_size // args.dp_degree
-
-    generator = torch.Generator(device="cpu")
-    generator.manual_seed(args.seed)
-    tokens = torch.randint(
-        0,
-        vocab_size,
-        (args.global_batch_size, args.seq_len + 1),
-        generator=generator,
-        dtype=torch.long,
-    )
-
-    start = dp_rank * local_dp_batch_size
-    stop = start + local_dp_batch_size
-    return tokens[start:stop].to(device, non_blocking=True)
-
-
-def vocab_parallel_cross_entropy(
-    logits: torch.Tensor,
-    labels: torch.Tensor,
-    *,
-    vocab_size: int,
-    vocab_group,
-    vocab_rank: int,
-    vocab_degree: int,
-    global_token_count: int,
-) -> torch.Tensor:
-    if logits.shape[:2] != labels.shape:
-        raise ValueError(
-            f"logits shape {tuple(logits.shape)} is incompatible with "
-            f"labels shape {tuple(labels.shape)}."
-        )
-
-    local_vocab_size = logits.shape[-1]
-    vocab_start = vocab_rank * local_vocab_size
-    vocab_stop = vocab_start + local_vocab_size
-    if vocab_rank == vocab_degree - 1:
-        vocab_stop = vocab_size
-
-    logits = logits.float()
-    local_max = logits.amax(dim=-1)
-    with torch.no_grad():
-        global_max = local_max.detach().clone()
-        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=vocab_group)
-
-    shifted_logits = logits - global_max.unsqueeze(-1)
-    local_exp_sum = shifted_logits.exp().sum(dim=-1)
-    global_exp_sum = dist_nn_func.all_reduce(
-        local_exp_sum,
-        op=dist.ReduceOp.SUM,
-        group=vocab_group,
-    )
-
-    target_mask = (labels >= vocab_start) & (labels < vocab_stop)
-    local_target = torch.zeros_like(labels, dtype=torch.long)
-    local_target[target_mask] = labels[target_mask] - vocab_start
-    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
-    local_target_logits = local_target_logits * target_mask.to(logits.dtype)
-    target_logits = dist_nn_func.all_reduce(
-        local_target_logits,
-        op=dist.ReduceOp.SUM,
-        group=vocab_group,
-    )
-
-    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
-    return loss_sum / (global_token_count * vocab_degree)
-
-
-def chunk_ranges(size: int, chunk_size: int):
-    if chunk_size <= 0:
-        yield 0, size
-        return
-    for start in range(0, size, chunk_size):
-        yield start, min(start + chunk_size, size)
-
-
-def print_rank0(message: str) -> None:
-    if dist.get_rank() == 0:
-        print(message, flush=True)
-
-
-def print_cuda_memory(stage: str, device: torch.device) -> None:
-    allocated = torch.cuda.memory_allocated(device) / 1024**3
-    reserved = torch.cuda.memory_reserved(device) / 1024**3
-    max_reserved = torch.cuda.max_memory_reserved(device) / 1024**3
-    print_rank0(
-        f"{stage}: cuda allocated={allocated:.2f}GiB "
-        f"reserved={reserved:.2f}GiB max_reserved={max_reserved:.2f}GiB"
-    )
-
-
-def main():
-    args = parse_args()
-    logging.basicConfig(level=logging.DEBUG)
-
-    device, mesh = init_distributed(args)
-    ep_group = mesh.get_group("ep")
-    ep_rank = mesh.get_local_rank("ep")
-    local_dp_batch_size = args.global_batch_size // args.dp_degree
-    local_dp_microbatch = args.microbatch_size * args.ep_degree
-    gradient_accumulation_steps = local_dp_batch_size // local_dp_microbatch
-
-    torch.manual_seed(args.seed)
-    model_args = make_model_args(args.flavor, args.seq_len)
-    if args.seq_len is None:
-        args.seq_len = model_args.max_seq_len
-    if model_args.num_experts % args.ep_degree != 0:
-        raise ValueError(
-            f"num_experts ({model_args.num_experts}) must be divisible by "
-            f"ep-degree ({args.ep_degree})."
-        )
-    trace_global_batch_size = args.microbatch_size * args.dp_degree * args.ep_degree
-
-    with torch.device("meta"):
-        model = Transformer(model_args, mesh=mesh, moe_axis_name="ep")
-
-    def input_fn():
-        return torch.randint(
-            0,
-            model_args.vocab_size,
-            (trace_global_batch_size, args.seq_len),
-            device=device,
-        )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
-    )
-
-    print_rank0(
-        f"Qwen3 {args.flavor} sanity check: "
-        f"mesh=(dp={args.dp_degree}, ep={args.ep_degree}), "
-        f"global_batch={args.global_batch_size}, "
-        f"local_dp_batch={local_dp_batch_size}, "
-        f"per_rank_microbatch={args.microbatch_size}, "
-        f"local_dp_microbatch={local_dp_microbatch}, "
-        f"grad_accum={gradient_accumulation_steps}, "
-        f"trace_global_batch={trace_global_batch_size}, "
-        f"seq_len={args.seq_len}, "
-        f"loss_chunk_size={args.loss_chunk_size}, "
-        f"optimizer={args.optimizer}"
-    )
-
-    t0 = time.time()
-    with AutoParallel(
-        model,
-        input_fn,
-        mesh,
-        mp_policy,
-        dynamic=True,
-        repeated_subgraphs=True,
-    ) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-        autop.add_input_constraints([(Shard(0), Shard(0))])
-        autop.add_output_constraints([(Shard(0), Shard(2))])
-        sharding_placement = autop.optimize_placement(verbose=args.verbose)
-        parallel_mod = autop.apply_placement(sharding_placement)
-
-    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
-    print_cuda_memory("after AutoParallel", device)
-
-    parallel_mod.to_empty(device=device)
-    print_cuda_memory("after to_empty", device)
-    parallel_mod.init_weights(buffer_device=device, seed=args.seed)  # type: ignore[operator]
-    print_cuda_memory("after init_weights", device)
-
-    if args.compile:
-        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
-
-    batch = make_local_tokens(args, mesh, device, model_args.vocab_size)
-    inputs = batch[:, :-1].contiguous()
-    labels = batch[:, 1:].contiguous()
-
-    ep_coordinate = mesh.get_coordinate()[1]
-    input_microbatches = []
-    label_microbatches = []
-    for start in range(0, local_dp_batch_size, local_dp_microbatch):
-        stop = start + local_dp_microbatch
-        input_block = inputs[start:stop]
-        input_start = ep_coordinate * args.microbatch_size
-        input_stop = input_start + args.microbatch_size
-        input_microbatches.append(input_block[input_start:input_stop].contiguous())
-        label_microbatches.append(labels[start:stop].contiguous())
-
-    global_token_count = args.global_batch_size * args.seq_len
-    if args.optimizer == "adamw":
-        optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
-    elif args.optimizer == "sgd":
-        optimizer = torch.optim.SGD(parallel_mod.parameters(), lr=args.lr)
-    else:
-        optimizer = None
-
-    try:
-        losses: list[float] = []
-        for step in range(args.train_steps):
-            if optimizer is not None:
-                optimizer.zero_grad(set_to_none=True)
-            else:
-                parallel_mod.zero_grad(set_to_none=True)
-            step_loss = torch.zeros((), device=device)
-            for micro_inputs, micro_labels in zip(
-                input_microbatches, label_microbatches
-            ):
-                logits = parallel_mod(micro_inputs)
-
-                seq_ranges = list(chunk_ranges(logits.shape[1], args.loss_chunk_size))
-                for chunk_idx, (seq_start, seq_stop) in enumerate(seq_ranges):
-                    logits_chunk = logits[:, seq_start:seq_stop]
-                    labels_chunk = micro_labels[:, seq_start:seq_stop]
-                    loss = vocab_parallel_cross_entropy(
-                        logits_chunk,
-                        labels_chunk,
-                        vocab_size=model_args.vocab_size,
-                        vocab_group=ep_group,
-                        vocab_rank=ep_rank,
-                        vocab_degree=args.ep_degree,
-                        global_token_count=global_token_count,
-                    )
-                    if torch.any(torch.isnan(loss)):
-                        raise RuntimeError("Found NaNs in Qwen3 MoE training loss.")
-
-                    retain_graph = chunk_idx != len(seq_ranges) - 1
-                    loss.backward(retain_graph=retain_graph)
-                    step_loss = step_loss + loss.detach()
-
-            torch.nn.utils.clip_grad_norm_(
-                parallel_mod.parameters(), args.max_grad_norm
-            )
-            if optimizer is not None:
-                optimizer.step()
-
-            with torch.no_grad():
-                logged_loss = step_loss.clone()
-                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
-                loss_value = float(logged_loss.item())
-            losses.append(loss_value)
-            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
-            print_cuda_memory(f"after step {step:03d}", device)
-
-        if (
-            not args.skip_loss_improvement_check
-            and len(losses) > 1
-            and losses[-1] >= losses[0]
-        ):
-            raise RuntimeError(
-                f"Qwen3 MoE training loss did not improve: "
-                f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
-            )
-
-        if len(losses) > 1:
-            print_rank0(
-                f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
-            )
-        dist.barrier(device_ids=[device.index])
-        torch.cuda.synchronize(device)
-    finally:
-        if dist.is_initialized():
-            dist.destroy_process_group()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/example_torchtitan_qwen3_dense.py b/examples/example_torchtitan_qwen3_dense.py
deleted file mode 100644
index a4685d1b..00000000
--- a/examples/example_torchtitan_qwen3_dense.py
+++ /dev/null
@@ -1,370 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import argparse
-import dataclasses
-import logging
-import os
-import sys
-import time
-from pathlib import Path
-
-import torch
-import torch.distributed as dist
-import torch.distributed.nn.functional as dist_nn_func
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor.placement_types import Replicate, Shard
-
-from autoparallel.api import AutoParallel
-from autoparallel.compile import autoparallel_backend
-
-
-def _add_sibling_torchtitan_to_path() -> None:
-    repo_root = Path(__file__).resolve().parents[1]
-    torchtitan_root = repo_root.parent / "torchtitan"
-    if torchtitan_root.exists():
-        sys.path.insert(0, str(torchtitan_root))
-
-
-_add_sibling_torchtitan_to_path()
-
-from torchtitan.models.qwen3 import Qwen3Model, qwen3_configs  # noqa: E402
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description=(
-            "Run torchtitan's dense Qwen3 model through AutoParallel's "
-            "searched placement on real GPUs."
-        )
-    )
-    parser.add_argument(
-        "--flavor",
-        choices=("debugmodel", "debugmodel_fused_qkv", "0.6B", "1.7B", "4B", "8B"),
-        default="8B",
-        help="Dense torchtitan Qwen3 flavor.",
-    )
-    parser.add_argument(
-        "--global-batch-size",
-        type=int,
-        default=4,
-        help="Global batch size across data-parallel ranks.",
-    )
-    parser.add_argument(
-        "--microbatch-size",
-        type=int,
-        default=1,
-        help="Per-DP-rank microbatch size for gradient accumulation.",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=2048,
-        help="Sequence length for the real sanity run.",
-    )
-    parser.add_argument(
-        "--dp-degree",
-        type=int,
-        default=2,
-        help="Data-parallel mesh degree.",
-    )
-    parser.add_argument(
-        "--tp-degree",
-        type=int,
-        default=2,
-        help="Tensor-parallel mesh degree.",
-    )
-    parser.add_argument(
-        "--train-steps",
-        type=int,
-        default=2,
-        help="Number of optimizer steps.",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=3e-4,
-        help="AdamW learning rate.",
-    )
-    parser.add_argument(
-        "--max-grad-norm",
-        type=float,
-        default=1.0,
-        help="Gradient clipping max norm.",
-    )
-    parser.add_argument(
-        "--seed",
-        type=int,
-        default=0,
-        help="Seed for model initialization and synthetic data generation.",
-    )
-    parser.add_argument(
-        "--compile",
-        action="store_true",
-        help="Compile the placed module with the AutoParallel backend before training.",
-    )
-    parser.add_argument(
-        "--verbose",
-        action="store_true",
-        help="Print the full AutoParallel optimizer log.",
-    )
-    return parser.parse_args()
-
-
-def make_model_config(flavor: str, seq_len: int) -> Qwen3Model.Config:
-    config = qwen3_configs[flavor](attn_backend="sdpa")
-    config.rope = dataclasses.replace(config.rope, max_seq_len=seq_len)
-    return config
-
-
-def init_distributed(args):
-    if "WORLD_SIZE" not in os.environ or "LOCAL_RANK" not in os.environ:
-        raise RuntimeError(
-            "Run this example with torchrun, e.g. "
-            "torchrun --standalone --nproc-per-node 4 "
-            "examples/example_torchtitan_qwen3_dense.py"
-        )
-
-    world_size = int(os.environ["WORLD_SIZE"])
-    local_rank = int(os.environ["LOCAL_RANK"])
-    expected_world_size = args.dp_degree * args.tp_degree
-    if world_size != expected_world_size:
-        raise ValueError(
-            f"WORLD_SIZE ({world_size}) must equal dp-degree * tp-degree "
-            f"({args.dp_degree} * {args.tp_degree} = {expected_world_size})."
-        )
-    if args.global_batch_size % args.dp_degree != 0:
-        raise ValueError(
-            f"global-batch-size ({args.global_batch_size}) must be divisible by "
-            f"dp-degree ({args.dp_degree})."
-        )
-    local_batch_size = args.global_batch_size // args.dp_degree
-    if local_batch_size % args.microbatch_size != 0:
-        raise ValueError(
-            f"local batch size ({local_batch_size}) must be divisible by "
-            f"microbatch-size ({args.microbatch_size})."
-        )
-
-    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
-    dist.init_process_group("nccl", device_id=device)
-    mesh = torch.distributed.device_mesh.init_device_mesh(
-        "cuda",
-        (args.dp_degree, args.tp_degree),
-        mesh_dim_names=("dp", "tp"),
-    )
-    return device, mesh
-
-
-def make_local_tokens(args, mesh, device, vocab_size: int) -> torch.Tensor:
-    coordinate = mesh.get_coordinate()
-    if coordinate is None:
-        raise RuntimeError("DeviceMesh coordinate is unavailable on this rank.")
-    dp_rank, _tp_rank = coordinate
-    local_batch_size = args.global_batch_size // args.dp_degree
-
-    generator = torch.Generator(device="cpu")
-    generator.manual_seed(args.seed)
-    tokens = torch.randint(
-        0,
-        vocab_size,
-        (args.global_batch_size, args.seq_len + 1),
-        generator=generator,
-        dtype=torch.long,
-    )
-
-    start = dp_rank * local_batch_size
-    stop = start + local_batch_size
-    return tokens[start:stop].to(device, non_blocking=True)
-
-
-def vocab_parallel_cross_entropy(
-    logits: torch.Tensor,
-    labels: torch.Tensor,
-    *,
-    vocab_size: int,
-    tp_group,
-    tp_rank: int,
-    tp_degree: int,
-    global_token_count: int,
-) -> torch.Tensor:
-    if logits.shape[:2] != labels.shape:
-        raise ValueError(
-            f"logits shape {tuple(logits.shape)} is incompatible with "
-            f"labels shape {tuple(labels.shape)}."
-        )
-
-    local_vocab_size = logits.shape[-1]
-    vocab_start = tp_rank * local_vocab_size
-    vocab_stop = vocab_start + local_vocab_size
-    if tp_rank == tp_degree - 1:
-        vocab_stop = vocab_size
-
-    logits = logits.float()
-    local_max = logits.amax(dim=-1)
-    with torch.no_grad():
-        global_max = local_max.detach().clone()
-        dist.all_reduce(global_max, op=dist.ReduceOp.MAX, group=tp_group)
-
-    shifted_logits = logits - global_max.unsqueeze(-1)
-    local_exp_sum = shifted_logits.exp().sum(dim=-1)
-    global_exp_sum = dist_nn_func.all_reduce(
-        local_exp_sum,
-        op=dist.ReduceOp.SUM,
-        group=tp_group,
-    )
-
-    target_mask = (labels >= vocab_start) & (labels < vocab_stop)
-    local_target = torch.zeros_like(labels, dtype=torch.long)
-    local_target[target_mask] = labels[target_mask] - vocab_start
-    local_target_logits = logits.gather(-1, local_target.unsqueeze(-1)).squeeze(-1)
-    local_target_logits = local_target_logits * target_mask.to(logits.dtype)
-    target_logits = dist_nn_func.all_reduce(
-        local_target_logits,
-        op=dist.ReduceOp.SUM,
-        group=tp_group,
-    )
-
-    loss_sum = (global_exp_sum.log() + global_max - target_logits).sum()
-    return loss_sum / (global_token_count * tp_degree)
-
-
-def print_rank0(message: str) -> None:
-    if dist.get_rank() == 0:
-        print(message, flush=True)
-
-
-def main():
-    args = parse_args()
-    logging.basicConfig(level=logging.DEBUG)
-
-    device, mesh = init_distributed(args)
-    tp_group = mesh.get_group("tp")
-    tp_rank = mesh.get_local_rank("tp")
-    local_batch_size = args.global_batch_size // args.dp_degree
-    gradient_accumulation_steps = local_batch_size // args.microbatch_size
-
-    torch.manual_seed(args.seed)
-    model_config = make_model_config(args.flavor, args.seq_len)
-    vocab_size = model_config.vocab_size
-
-    with torch.device("meta"):
-        model = model_config.build()
-
-    def input_fn():
-        return torch.randint(
-            0,
-            vocab_size,
-            (args.global_batch_size, args.seq_len),
-            device=device,
-        )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
-    )
-
-    print_rank0(
-        f"torchtitan Qwen3 {args.flavor} via AutoParallel: "
-        f"mesh=(dp={args.dp_degree}, tp={args.tp_degree}), "
-        f"global_batch={args.global_batch_size}, "
-        f"local_batch={local_batch_size}, "
-        f"microbatch={args.microbatch_size}, "
-        f"grad_accum={gradient_accumulation_steps}, "
-        f"seq_len={args.seq_len}"
-    )
-
-    t0 = time.time()
-    with AutoParallel(
-        model,
-        input_fn,
-        mesh,
-        mp_policy,
-        repeated_subgraphs=True,
-    ) as autop:
-        autop.add_parameter_memory_constraint(low=None, high=None)
-        autop.add_input_constraints([(Shard(0), Replicate())])
-        autop.add_output_constraints([(Shard(0), Shard(2))])
-        sharding_placement = autop.optimize_placement(verbose=args.verbose)
-        parallel_mod = autop.apply_placement(sharding_placement)
-
-    print_rank0(f"Tracing + optimization took {time.time() - t0:.1f}s")
-
-    parallel_mod.to_empty(device=device)
-    torch.manual_seed(args.seed)
-    parallel_mod.init_weights(buffer_device=device)  # type: ignore[operator]
-
-    if args.compile:
-        parallel_mod = torch.compile(parallel_mod, backend=autoparallel_backend())
-
-    batch = make_local_tokens(args, mesh, device, vocab_size)
-    inputs = batch[:, :-1].contiguous()
-    labels = batch[:, 1:].contiguous()
-    input_microbatches = torch.split(inputs, args.microbatch_size, dim=0)
-    label_microbatches = torch.split(labels, args.microbatch_size, dim=0)
-
-    global_token_count = args.global_batch_size * args.seq_len
-    optimizer = torch.optim.AdamW(parallel_mod.parameters(), lr=args.lr)
-
-    try:
-        losses: list[float] = []
-        for step in range(args.train_steps):
-            optimizer.zero_grad(set_to_none=True)
-            step_loss = torch.zeros((), device=device)
-            for micro_inputs, micro_labels in zip(
-                input_microbatches, label_microbatches
-            ):
-                logits = parallel_mod(micro_inputs)
-                if torch.any(torch.isnan(logits)):
-                    raise RuntimeError("Found NaNs in forward output.")
-
-                loss = vocab_parallel_cross_entropy(
-                    logits,
-                    micro_labels,
-                    vocab_size=vocab_size,
-                    tp_group=tp_group,
-                    tp_rank=tp_rank,
-                    tp_degree=args.tp_degree,
-                    global_token_count=global_token_count,
-                )
-                if torch.any(torch.isnan(loss)):
-                    raise RuntimeError("Found NaNs in training loss.")
-
-                loss.backward()
-                step_loss = step_loss + loss.detach()
-
-            torch.nn.utils.clip_grad_norm_(
-                parallel_mod.parameters(), args.max_grad_norm
-            )
-            optimizer.step()
-
-            with torch.no_grad():
-                logged_loss = step_loss.clone()
-                dist.all_reduce(logged_loss, op=dist.ReduceOp.SUM)
-                loss_value = float(logged_loss.item())
-            losses.append(loss_value)
-            print_rank0(f"step={step:03d} loss={loss_value:.6f}")
-
-        if len(losses) > 1 and losses[-1] >= losses[0]:
-            raise RuntimeError(
-                f"Training loss did not improve: "
-                f"initial={losses[0]:.6f}, final={losses[-1]:.6f}"
-            )
-
-        if len(losses) > 1:
-            print_rank0(
-                f"Loss improved: initial={losses[0]:.6f}, final={losses[-1]:.6f}"
-            )
-        else:
-            print_rank0(f"Completed one step: loss={losses[0]:.6f}")
-        dist.barrier(device_ids=[device.index])
-        torch.cuda.synchronize(device)
-    finally:
-        if dist.is_initialized():
-            dist.destroy_process_group()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/conftest.py b/tests/conftest.py
index d5d23ea1..22af2357 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,6 +40,16 @@ def apply_cuda_patches(func):
     return func
 
 
+@pytest.fixture(autouse=True)
+def _reset_placement_options_cache():
+    """The placement-options cache is a process-global; clear it before each test
+    so optimizer builds never reuse stale strategies from a prior test's model."""
+    from autoparallel.shardings.placement_options import reset_placement_options_cache
+
+    reset_placement_options_cache()
+    yield
+
+
 @pytest.fixture(scope="module", autouse=True)
 def init_pg():
     world_size = 256
diff --git a/tests/test_dsv3_torchtitan_config.py b/tests/test_dsv3_torchtitan_config.py
deleted file mode 100644
index e009206b..00000000
--- a/tests/test_dsv3_torchtitan_config.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import sys
-from pathlib import Path
-
-import pytest
-import torch
-
-from autoparallel._testing.models.dsv3 import DeepSeekV3Model
-
-
-def test_dsv3_accepts_torchtitan_grouped_experts_config():
-    torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan"
-    if not torchtitan_root.exists():
-        pytest.skip("torchtitan sibling checkout not found")
-    sys.path.insert(0, str(torchtitan_root))
-
-    try:
-        from torchtitan.models.deepseek_v3 import deepseekv3_configs  # type: ignore[import-not-found]
-    except Exception as exc:
-        pytest.skip(f"torchtitan DeepSeek-V3 config unavailable: {exc}")
-
-    with torch.device("meta"):
-        model = DeepSeekV3Model(
-            deepseekv3_configs["debugmodel"](
-                attn_backend="sdpa",
-                moe_comm_backend="standard",
-            )
-        )
-
-    moe_layer = next(layer for layer in model.layers.values() if layer.moe_enabled)
-    assert moe_layer.moe.experts.use_grouped_mm
diff --git a/tests/test_optimize_placement.py b/tests/test_optimize_placement.py
index 9325f1f5..20ac9d95 100644
--- a/tests/test_optimize_placement.py
+++ b/tests/test_optimize_placement.py
@@ -867,9 +867,7 @@ def test_invalid_strategies_are_pruned(device_mesh_2d):
         assert all(k in opt._valid_keys for k in opt.decision_vars)
 
         # No inf-cost (== 0) constraints should be emitted any more.
-        assert not any(
-            name.startswith("inf_cases") for name in opt.prob.constraints
-        )
+        assert not any(name.startswith("inf_cases") for name in opt.prob.constraints)
 
         # The pruned problem must still solve to a valid solution.
         solution = autop.optimize_placement()
diff --git a/tests/test_qwen3.py b/tests/test_qwen3.py
deleted file mode 100644
index 5b32bc5b..00000000
--- a/tests/test_qwen3.py
+++ /dev/null
@@ -1,323 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-
-import sys
-from pathlib import Path
-
-import pytest
-import torch
-from torch.distributed.fsdp import MixedPrecisionPolicy
-from torch.distributed.tensor import DTensor
-from torch.distributed.tensor.placement_types import Replicate, Shard
-
-from autoparallel._testing.models.qwen3 import (
-    Qwen3ModelArgs,
-    Transformer,
-    apply_rotary_emb_cos_sin,
-    qwen3_debug_args,
-    qwen3_args_from_torchtitan_config,
-    qwen3_moe_debug_args,
-)
-from autoparallel.api import AutoParallel, auto_parallel
-
-
-def _tiny_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=64,
-        n_layers=2,
-        n_heads=4,
-        n_kv_heads=2,
-        head_dim=16,
-        hidden_dim=128,
-        vocab_size=128,
-        max_seq_len=16,
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def _tiny_moe_args(**overrides) -> Qwen3ModelArgs:
-    args = Qwen3ModelArgs(
-        dim=32,
-        n_layers=1,
-        n_heads=4,
-        n_kv_heads=2,
-        head_dim=8,
-        hidden_dim=64,
-        vocab_size=64,
-        max_seq_len=4,
-        moe_enabled=True,
-        moe_hidden_dim=16,
-        num_experts=64,
-        top_k=8,
-        route_norm=True,
-        score_before_experts=False,
-        moe_axis_name="tp",
-    )
-    for key, value in overrides.items():
-        setattr(args, key, value)
-    args.__post_init__()
-    return args
-
-
-def test_qwen3_forward_shape():
-    args = _tiny_args()
-    model = Transformer(args)
-    model.init_weights(seed=0)
-
-    tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len))
-    logits = model(tokens)
-
-    assert logits.shape == (2, args.max_seq_len, args.vocab_size)
-
-
-def test_qwen3_qk_norm_changes_logits():
-    args = _tiny_args(n_layers=1)
-    model = Transformer(args)
-    model.init_weights(seed=0)
-
-    tokens = torch.randint(0, args.vocab_size, (2, args.max_seq_len))
-    logits = model(tokens)
-
-    with torch.no_grad():
-        model.layers["0"].attention.q_norm.weight.zero_()
-    logits_without_q = model(tokens)
-
-    assert not torch.allclose(logits, logits_without_q)
-
-
-def test_qwen3_weight_tying_survives_init_weights():
-    args = _tiny_args(enable_weight_tying=True)
-    model = Transformer(args)
-
-    assert model.tok_embeddings.weight is model.lm_head.weight
-    model.init_weights(seed=0)
-    assert model.tok_embeddings.weight is model.lm_head.weight
-
-
-def test_qwen3_debug_args_matches_torchtitan_dense_shape():
-    args = qwen3_debug_args(max_seq_len=32)
-
-    assert args.dim == 256
-    assert args.n_layers == 8
-    assert args.n_heads == 16
-    assert args.n_kv_heads == 8
-    assert args.head_dim == 128
-    assert args.hidden_dim == 3072
-    assert args.vocab_size == 2048
-    assert args.rope_theta == 1000000.0
-    assert args.enable_weight_tying
-
-
-def test_qwen3_moe_debug_args_matches_torchtitan_shape():
-    args = qwen3_moe_debug_args(max_seq_len=32)
-
-    assert args.dim == 256
-    assert args.n_layers == 8
-    assert args.n_heads == 16
-    assert args.n_kv_heads == 8
-    assert args.head_dim == 128
-    assert args.moe_enabled
-    assert args.moe_hidden_dim == 768
-    assert args.num_experts == 64
-    assert args.top_k == 8
-    assert args.route_norm
-    assert not args.score_before_experts
-
-
-@pytest.mark.parametrize(
-    ("flavor", "expected"),
-    [
-        (
-            "8B",
-            {
-                "dim": 4096,
-                "n_layers": 36,
-                "n_heads": 32,
-                "n_kv_heads": 8,
-                "head_dim": 128,
-                "hidden_dim": 12288,
-                "vocab_size": 151936,
-                "moe_enabled": False,
-                "num_experts": 0,
-                "top_k": 1,
-                "max_seq_len": 4096,
-            },
-        ),
-        (
-            "30B-A3B",
-            {
-                "dim": 2048,
-                "n_layers": 48,
-                "n_heads": 32,
-                "n_kv_heads": 4,
-                "head_dim": 128,
-                "hidden_dim": 0,
-                "vocab_size": 151936,
-                "moe_enabled": True,
-                "moe_hidden_dim": 768,
-                "num_experts": 128,
-                "top_k": 8,
-                "route_norm": True,
-                "score_before_experts": False,
-                "max_seq_len": 262144,
-            },
-        ),
-    ],
-)
-def test_qwen3_args_from_torchtitan_config(flavor, expected):
-    torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan"
-    if not torchtitan_root.exists():
-        pytest.skip("torchtitan sibling checkout not found")
-    sys.path.insert(0, str(torchtitan_root))
-
-    try:
-        from torchtitan.models.qwen3 import qwen3_configs  # type: ignore[import-not-found]
-    except Exception as exc:
-        pytest.skip(f"torchtitan Qwen3 config unavailable: {exc}")
-
-    args = qwen3_args_from_torchtitan_config(
-        qwen3_configs[flavor](attn_backend="sdpa")
-    )
-
-    for attr, value in expected.items():
-        assert getattr(args, attr) == value
-    assert args.rope_theta == 1000000.0
-    assert args.norm_eps == 1e-6
-
-
-def test_qwen3_cos_sin_rope_matches_torchtitan_helper():
-    torchtitan_root = Path(__file__).resolve().parents[2] / "torchtitan"
-    if not torchtitan_root.exists():
-        pytest.skip("torchtitan sibling checkout not found")
-    sys.path.insert(0, str(torchtitan_root))
-
-    try:
-        from torchtitan.models.common.rope import (  # type: ignore[import-not-found]
-            RoPE,
-            apply_rotary_emb_cos_sin as tt_apply_rotary_emb_cos_sin,
-        )
-    except Exception as exc:
-        pytest.skip(f"torchtitan Qwen3 RoPE helper unavailable: {exc}")
-
-    args = _tiny_args()
-    rope = RoPE(
-        RoPE.Config(
-            dim=args.head_dim,
-            max_seq_len=args.max_seq_len,
-            theta=args.rope_theta,
-            backend="cos_sin",
-        )
-    )
-    xq = torch.randn(2, args.max_seq_len, args.n_heads, args.head_dim)
-    xk = torch.randn(2, args.max_seq_len, args.n_kv_heads, args.head_dim)
-
-    actual = apply_rotary_emb_cos_sin(xq, xk, rope.cache)
-    expected = tt_apply_rotary_emb_cos_sin(xq, xk, rope.cache)
-
-    torch.testing.assert_close(actual[0], expected[0])
-    torch.testing.assert_close(actual[1], expected[1])
-
-
-def test_qwen3_autoparallel_pipeline_smoke(device_mesh_2d):
-    args = _tiny_args(n_layers=2, max_seq_len=8)
-    batch_size = 2 * device_mesh_2d.shape[0]
-
-    with torch.device("meta"):
-        model = Transformer(args)
-
-    def input_fn():
-        return torch.randint(
-            0,
-            args.vocab_size,
-            (batch_size, args.max_seq_len),
-            device="cuda",
-        )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
-    )
-
-    with AutoParallel(
-        model,
-        input_fn,
-        device_mesh_2d,
-        mp_policy,
-        repeated_subgraphs=True,
-    ) as autop:
-        autop.add_input_constraints([(Shard(0), Replicate())])
-        autop.add_output_constraints([(Shard(0), Shard(2))])
-        sharding_placement = autop.optimize_placement(verbose=False)
-        parallel_mod = autop.apply_placement(sharding_placement)
-
-    assert isinstance(parallel_mod, Transformer)
-
-
-def test_qwen3_moe_auto_parallel_smoke(device_mesh_2d):
-    args = _tiny_moe_args()
-    local_batch_size = 1
-
-    with torch.device("meta"):
-        model = Transformer(args, mesh=device_mesh_2d, moe_axis_name="tp")
-
-    expected_param_shapes = {
-        name: tuple(param.shape) for name, param in model.named_parameters()
-    }
-    expected_nparams = sum(param.numel() for param in model.parameters())
-
-    tokens = DTensor.from_local(
-        torch.randint(
-            0,
-            args.vocab_size,
-            (local_batch_size, args.max_seq_len),
-            device="cuda",
-        ),
-        device_mesh_2d,
-        [Shard(0), Shard(0)],
-    )
-
-    mp_policy = MixedPrecisionPolicy(
-        param_dtype=torch.bfloat16,
-        reduce_dtype=torch.float32,
-    )
-    parallel_mod = auto_parallel(
-        model,
-        device_mesh_2d,
-        sample_inputs=(tokens,),
-        out_shardings=(Shard(0), Shard(2)),
-        mp_policy=mp_policy,
-        dynamic=True,
-    )
-
-    assert isinstance(parallel_mod, Transformer)
-    assert sum(param.numel() for param in parallel_mod.parameters()) == expected_nparams
-    assert {
-        name: tuple(param.shape) for name, param in parallel_mod.named_parameters()
-    } == expected_param_shapes
-    assert parallel_mod.layers["0"].moe.experts.w1.shape == (
-        args.num_experts,
-        args.moe_hidden_dim,
-        args.dim,
-    )
-
-    parallel_mod.to_empty(device="cuda")
-    parallel_mod.init_weights(buffer_device=torch.device("cuda"), seed=0)
-
-    local_tokens = torch.randint(
-        0,
-        args.vocab_size,
-        (local_batch_size, args.max_seq_len),
-        device="cuda",
-    )
-    out = parallel_mod(local_tokens)
-    assert out.shape == (
-        local_batch_size * device_mesh_2d.shape[1],
-        args.max_seq_len,
-        args.vocab_size // device_mesh_2d.shape[1],
-    )
-    out.backward(torch.randn_like(out))

From 8eda68d266fa7667d0fe432774ac7c78324a6d6b Mon Sep 17 00:00:00 2001
From: Kaijian Wang <wangkj@meta.com>
Date: Mon, 8 Jun 2026 12:37:07 -0700
Subject: [PATCH 27/27] Fall back to serial build when CUDA is initialized (fix
 fork crash)

The parallel decision-var cost build forks workers, but forking a process that
has already initialized CUDA crashes them with "Cannot re-initialize CUDA in
forked subprocess" once they touch the NCCL cost model. Real-GPU runs (example
scripts, torchrun) and CUDA-touching tests hit this. Skip the fork and use the
byte-identical serial path whenever torch.cuda.is_initialized().

Authored with Claude.
---
 autoparallel/optimize_sharding.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
index b14cb142..c3832c87 100644
--- a/autoparallel/optimize_sharding.py
+++ b/autoparallel/optimize_sharding.py
@@ -1127,7 +1127,16 @@ def _compute_node_edge_costs(self, root_idxs):
         global _FORK_OPT
         _FORK_OPT = self
         try:
-            if _PARALLEL_BUILD_WORKERS <= 1 or len(root_idxs) < 64:
+            # Forking a process that has already initialized CUDA crashes the
+            # workers ("Cannot re-initialize CUDA in forked subprocess") once they
+            # touch the NCCL cost model. Real-GPU runs (examples, torchrun) and
+            # any test that has touched CUDA hit this, so fall back to the
+            # (byte-identical) serial path whenever CUDA is live.
+            if (
+                _PARALLEL_BUILD_WORKERS <= 1
+                or len(root_idxs) < 64
+                or torch.cuda.is_initialized()
+            ):
                 return [_par_node_edge_costs(ni) for ni in root_idxs]
             import multiprocessing as mp